daihui.zhang
committed on
Commit
·
586518f
1
Parent(s):
a257a82
init whispercpp transcription
Browse files- .gitignore +0 -1
- .python-version +1 -0
- config.py +19 -0
- pyproject.toml +19 -0
- requirements.txt +114 -0
- run_client.py +1 -1
- transcribe/__pycache__/__init__.cpython-311.pyc +0 -0
- transcribe/__pycache__/client.cpython-311.pyc +0 -0
- transcribe/__pycache__/server.cpython-311.pyc +0 -0
- transcribe/__pycache__/utils.cpython-311.pyc +0 -0
- transcribe/__pycache__/vad.cpython-311.pyc +0 -0
- transcribe/server/__init__.py +2 -0
- transcribe/{server.py → server/base.py} +14 -315
- transcribe/server/transcription.py +316 -0
- transcribe/server/whispercpp.py +326 -0
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
__pycache__/
|
| 2 |
*.py[cod]
|
| 3 |
*$py.class
|
| 4 |
-
|
| 5 |
# C extensions
|
| 6 |
*.so
|
| 7 |
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
*.py[cod]
|
| 3 |
*$py.class
|
|
|
|
| 4 |
# C extensions
|
| 5 |
*.so
|
| 6 |
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.11
|
config.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
BASE_DIR = pathlib.Path(__file__).parent
|
| 5 |
+
MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
|
| 6 |
+
# 标点
|
| 7 |
+
SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', ':']
|
| 8 |
+
PAUSE_END_MARKERS = [',', ',', '、']
|
| 9 |
+
|
| 10 |
+
# whisper推理参数
|
| 11 |
+
WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
|
| 12 |
+
MAX_LENTH_ZH = 3
|
| 13 |
+
|
| 14 |
+
WHISPER_PROMPT_EN = "The following is an English sentence."
|
| 15 |
+
MAX_LENGTH_EN= 3
|
| 16 |
+
|
| 17 |
+
WHISPER_MODEL = 'medium-q5_0'
|
| 18 |
+
|
| 19 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "trans"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"av>=14.2.0",
|
| 9 |
+
"librosa>=0.11.0",
|
| 10 |
+
"numpy>=2.1.3",
|
| 11 |
+
"onnxruntime>=1.21.0",
|
| 12 |
+
"pyaudio>=0.2.14",
|
| 13 |
+
"setuptools>=78.1.0",
|
| 14 |
+
"soundfile>=0.13.1",
|
| 15 |
+
"torch>=2.6.0",
|
| 16 |
+
"tqdm>=4.67.1",
|
| 17 |
+
"websocket-client>=1.8.0",
|
| 18 |
+
"websockets>=15.0.1",
|
| 19 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file was autogenerated by uv via the following command:
|
| 2 |
+
# uv pip compile pyproject.toml -o requirements.txt
|
| 3 |
+
audioread==3.0.1
|
| 4 |
+
# via librosa
|
| 5 |
+
av==14.3.0
|
| 6 |
+
# via trans (pyproject.toml)
|
| 7 |
+
certifi==2025.1.31
|
| 8 |
+
# via requests
|
| 9 |
+
cffi==1.17.1
|
| 10 |
+
# via soundfile
|
| 11 |
+
charset-normalizer==3.4.1
|
| 12 |
+
# via requests
|
| 13 |
+
coloredlogs==15.0.1
|
| 14 |
+
# via onnxruntime
|
| 15 |
+
decorator==5.2.1
|
| 16 |
+
# via librosa
|
| 17 |
+
filelock==3.18.0
|
| 18 |
+
# via torch
|
| 19 |
+
flatbuffers==25.2.10
|
| 20 |
+
# via onnxruntime
|
| 21 |
+
fsspec==2025.3.2
|
| 22 |
+
# via torch
|
| 23 |
+
humanfriendly==10.0
|
| 24 |
+
# via coloredlogs
|
| 25 |
+
idna==3.10
|
| 26 |
+
# via requests
|
| 27 |
+
jinja2==3.1.6
|
| 28 |
+
# via torch
|
| 29 |
+
joblib==1.4.2
|
| 30 |
+
# via
|
| 31 |
+
# librosa
|
| 32 |
+
# scikit-learn
|
| 33 |
+
lazy-loader==0.4
|
| 34 |
+
# via librosa
|
| 35 |
+
librosa==0.11.0
|
| 36 |
+
# via trans (pyproject.toml)
|
| 37 |
+
llvmlite==0.44.0
|
| 38 |
+
# via numba
|
| 39 |
+
markupsafe==3.0.2
|
| 40 |
+
# via jinja2
|
| 41 |
+
mpmath==1.3.0
|
| 42 |
+
# via sympy
|
| 43 |
+
msgpack==1.1.0
|
| 44 |
+
# via librosa
|
| 45 |
+
networkx==3.4.2
|
| 46 |
+
# via torch
|
| 47 |
+
numba==0.61.0
|
| 48 |
+
# via librosa
|
| 49 |
+
numpy==2.1.3
|
| 50 |
+
# via
|
| 51 |
+
# trans (pyproject.toml)
|
| 52 |
+
# librosa
|
| 53 |
+
# numba
|
| 54 |
+
# onnxruntime
|
| 55 |
+
# scikit-learn
|
| 56 |
+
# scipy
|
| 57 |
+
# soundfile
|
| 58 |
+
# soxr
|
| 59 |
+
onnxruntime==1.21.0
|
| 60 |
+
# via trans (pyproject.toml)
|
| 61 |
+
packaging==24.2
|
| 62 |
+
# via
|
| 63 |
+
# lazy-loader
|
| 64 |
+
# onnxruntime
|
| 65 |
+
# pooch
|
| 66 |
+
platformdirs==4.3.7
|
| 67 |
+
# via pooch
|
| 68 |
+
pooch==1.8.2
|
| 69 |
+
# via librosa
|
| 70 |
+
protobuf==6.30.2
|
| 71 |
+
# via onnxruntime
|
| 72 |
+
pyaudio==0.2.14
|
| 73 |
+
# via trans (pyproject.toml)
|
| 74 |
+
pycparser==2.22
|
| 75 |
+
# via cffi
|
| 76 |
+
requests==2.32.3
|
| 77 |
+
# via pooch
|
| 78 |
+
scikit-learn==1.6.1
|
| 79 |
+
# via librosa
|
| 80 |
+
scipy==1.15.2
|
| 81 |
+
# via
|
| 82 |
+
# librosa
|
| 83 |
+
# scikit-learn
|
| 84 |
+
setuptools==78.1.0
|
| 85 |
+
# via trans (pyproject.toml)
|
| 86 |
+
soundfile==0.13.1
|
| 87 |
+
# via
|
| 88 |
+
# trans (pyproject.toml)
|
| 89 |
+
# librosa
|
| 90 |
+
soxr==0.5.0.post1
|
| 91 |
+
# via librosa
|
| 92 |
+
sympy==1.13.1
|
| 93 |
+
# via
|
| 94 |
+
# onnxruntime
|
| 95 |
+
# torch
|
| 96 |
+
threadpoolctl==3.6.0
|
| 97 |
+
# via scikit-learn
|
| 98 |
+
torch==2.6.0
|
| 99 |
+
# via trans (pyproject.toml)
|
| 100 |
+
tqdm==4.67.1
|
| 101 |
+
# via trans (pyproject.toml)
|
| 102 |
+
typing-extensions==4.13.1
|
| 103 |
+
# via
|
| 104 |
+
# librosa
|
| 105 |
+
# torch
|
| 106 |
+
urllib3==2.3.0
|
| 107 |
+
# via requests
|
| 108 |
+
websocket-client==1.8.0
|
| 109 |
+
# via trans (pyproject.toml)
|
| 110 |
+
websockets==15.0.1
|
| 111 |
+
# via trans (pyproject.toml)
|
| 112 |
+
|
| 113 |
+
llama-cpp-python
|
| 114 |
+
pywhispercpp
|
run_client.py
CHANGED
|
@@ -2,7 +2,7 @@ from transcribe.client import TranscriptionClient
|
|
| 2 |
|
| 3 |
client = TranscriptionClient(
|
| 4 |
"localhost",
|
| 5 |
-
|
| 6 |
lang="zh",
|
| 7 |
save_output_recording=False, # Only used for microphone input, False by Default
|
| 8 |
output_recording_filename="./output_recording.wav", # Only used for microphone input
|
|
|
|
| 2 |
|
| 3 |
client = TranscriptionClient(
|
| 4 |
"localhost",
|
| 5 |
+
9090,
|
| 6 |
lang="zh",
|
| 7 |
save_output_recording=False, # Only used for microphone input, False by Default
|
| 8 |
output_recording_filename="./output_recording.wav", # Only used for microphone input
|
transcribe/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (183 Bytes)
|
|
|
transcribe/__pycache__/client.cpython-311.pyc
DELETED
|
Binary file (39 kB)
|
|
|
transcribe/__pycache__/server.cpython-311.pyc
DELETED
|
Binary file (36 kB)
|
|
|
transcribe/__pycache__/utils.cpython-311.pyc
DELETED
|
Binary file (4.64 kB)
|
|
|
transcribe/__pycache__/vad.cpython-311.pyc
DELETED
|
Binary file (9.36 kB)
|
|
|
transcribe/server/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from .transcription import TranscriptionServer
|
transcribe/{server.py → server/base.py}
RENAMED
|
@@ -1,324 +1,17 @@
|
|
| 1 |
-
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import pathlib
|
| 5 |
import threading
|
| 6 |
import time
|
| 7 |
-
|
| 8 |
-
from typing import List, Optional
|
| 9 |
-
|
| 10 |
import librosa
|
| 11 |
import numpy as np
|
| 12 |
import soundfile
|
| 13 |
from pywhispercpp.model import Model
|
| 14 |
-
from websockets.exceptions import ConnectionClosed
|
| 15 |
-
from websockets.sync.server import serve
|
| 16 |
-
|
| 17 |
-
from transcribe.vad import VoiceActivityDetector
|
| 18 |
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
class ClientManager:
|
| 23 |
-
def __init__(self, max_clients=4, max_connection_time=600):
|
| 24 |
-
"""
|
| 25 |
-
Initializes the ClientManager with specified limits on client connections and connection durations.
|
| 26 |
-
|
| 27 |
-
Args:
|
| 28 |
-
max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
|
| 29 |
-
max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
|
| 30 |
-
to 600 seconds (10 minutes).
|
| 31 |
-
"""
|
| 32 |
-
self.clients = {}
|
| 33 |
-
self.start_times = {}
|
| 34 |
-
self.max_clients = max_clients
|
| 35 |
-
self.max_connection_time = max_connection_time
|
| 36 |
-
|
| 37 |
-
def add_client(self, websocket, client):
|
| 38 |
-
"""
|
| 39 |
-
Adds a client and their connection start time to the tracking dictionaries.
|
| 40 |
-
|
| 41 |
-
Args:
|
| 42 |
-
websocket: The websocket associated with the client to add.
|
| 43 |
-
client: The client object to be added and tracked.
|
| 44 |
-
"""
|
| 45 |
-
self.clients[websocket] = client
|
| 46 |
-
self.start_times[websocket] = time.time()
|
| 47 |
-
|
| 48 |
-
def get_client(self, websocket):
|
| 49 |
-
"""
|
| 50 |
-
Retrieves a client associated with the given websocket.
|
| 51 |
-
|
| 52 |
-
Args:
|
| 53 |
-
websocket: The websocket associated with the client to retrieve.
|
| 54 |
-
|
| 55 |
-
Returns:
|
| 56 |
-
The client object if found, False otherwise.
|
| 57 |
-
"""
|
| 58 |
-
if websocket in self.clients:
|
| 59 |
-
return self.clients[websocket]
|
| 60 |
-
return False
|
| 61 |
-
|
| 62 |
-
def remove_client(self, websocket):
|
| 63 |
-
"""
|
| 64 |
-
Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
|
| 65 |
-
client if necessary.
|
| 66 |
-
|
| 67 |
-
Args:
|
| 68 |
-
websocket: The websocket associated with the client to be removed.
|
| 69 |
-
"""
|
| 70 |
-
client = self.clients.pop(websocket, None)
|
| 71 |
-
if client:
|
| 72 |
-
client.cleanup()
|
| 73 |
-
self.start_times.pop(websocket, None)
|
| 74 |
-
|
| 75 |
-
def get_wait_time(self):
|
| 76 |
-
"""
|
| 77 |
-
Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
|
| 78 |
-
|
| 79 |
-
Returns:
|
| 80 |
-
The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
|
| 81 |
-
"""
|
| 82 |
-
wait_time = None
|
| 83 |
-
for start_time in self.start_times.values():
|
| 84 |
-
current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
|
| 85 |
-
if wait_time is None or current_client_time_remaining < wait_time:
|
| 86 |
-
wait_time = current_client_time_remaining
|
| 87 |
-
return wait_time / 60 if wait_time is not None else 0
|
| 88 |
-
|
| 89 |
-
def is_server_full(self, websocket, options):
|
| 90 |
-
"""
|
| 91 |
-
Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
|
| 92 |
-
|
| 93 |
-
Args:
|
| 94 |
-
websocket: The websocket of the client attempting to connect.
|
| 95 |
-
options: A dictionary of options that may include the client's unique identifier.
|
| 96 |
-
|
| 97 |
-
Returns:
|
| 98 |
-
True if the server is full, False otherwise.
|
| 99 |
-
"""
|
| 100 |
-
if len(self.clients) >= self.max_clients:
|
| 101 |
-
wait_time = self.get_wait_time()
|
| 102 |
-
response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
|
| 103 |
-
websocket.send(json.dumps(response))
|
| 104 |
-
return True
|
| 105 |
-
return False
|
| 106 |
-
|
| 107 |
-
def is_client_timeout(self, websocket):
|
| 108 |
-
"""
|
| 109 |
-
Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
|
| 110 |
-
|
| 111 |
-
Args:
|
| 112 |
-
websocket: The websocket associated with the client to check.
|
| 113 |
-
|
| 114 |
-
Returns:
|
| 115 |
-
True if the client's connection time has exceeded the maximum limit, False otherwise.
|
| 116 |
-
"""
|
| 117 |
-
elapsed_time = time.time() - self.start_times[websocket]
|
| 118 |
-
if elapsed_time >= self.max_connection_time:
|
| 119 |
-
self.clients[websocket].disconnect()
|
| 120 |
-
logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
|
| 121 |
-
return True
|
| 122 |
-
return False
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
class BackendType(Enum):
|
| 126 |
-
PYWHISPERCPP = "pywhispercpp"
|
| 127 |
-
|
| 128 |
-
@staticmethod
|
| 129 |
-
def valid_types() -> List[str]:
|
| 130 |
-
return [backend_type.value for backend_type in BackendType]
|
| 131 |
-
|
| 132 |
-
@staticmethod
|
| 133 |
-
def is_valid(backend: str) -> bool:
|
| 134 |
-
return backend in BackendType.valid_types()
|
| 135 |
-
|
| 136 |
-
def is_pywhispercpp(self) -> bool:
|
| 137 |
-
return self == BackendType.PYWHISPERCPP
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
class TranscriptionServer:
|
| 141 |
-
RATE = 16000
|
| 142 |
-
|
| 143 |
-
def __init__(self):
|
| 144 |
-
self.client_manager = None
|
| 145 |
-
self.no_voice_activity_chunks = 0
|
| 146 |
-
self.single_model = False
|
| 147 |
-
|
| 148 |
-
def initialize_client(
|
| 149 |
-
self, websocket, options
|
| 150 |
-
):
|
| 151 |
-
client: Optional[ServeClientBase] = None
|
| 152 |
-
|
| 153 |
-
if self.backend.is_pywhispercpp():
|
| 154 |
-
client = ServeClientWhisperCPP(
|
| 155 |
-
websocket,
|
| 156 |
-
language=options["language"],
|
| 157 |
-
client_uid=options["uid"],
|
| 158 |
-
single_model=self.single_model,
|
| 159 |
-
)
|
| 160 |
-
logging.info("Running pywhispercpp backend.")
|
| 161 |
-
|
| 162 |
-
if client is None:
|
| 163 |
-
raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
|
| 164 |
-
|
| 165 |
-
self.client_manager.add_client(websocket, client)
|
| 166 |
-
|
| 167 |
-
def get_audio_from_websocket(self, websocket):
|
| 168 |
-
"""
|
| 169 |
-
Receives audio buffer from websocket and creates a numpy array out of it.
|
| 170 |
-
|
| 171 |
-
Args:
|
| 172 |
-
websocket: The websocket to receive audio from.
|
| 173 |
-
|
| 174 |
-
Returns:
|
| 175 |
-
A numpy array containing the audio.
|
| 176 |
-
"""
|
| 177 |
-
frame_data = websocket.recv()
|
| 178 |
-
if frame_data == b"END_OF_AUDIO":
|
| 179 |
-
return False
|
| 180 |
-
return np.frombuffer(frame_data, dtype=np.float32)
|
| 181 |
-
|
| 182 |
-
def handle_new_connection(self, websocket):
|
| 183 |
-
try:
|
| 184 |
-
logging.info("New client connected")
|
| 185 |
-
options = websocket.recv()
|
| 186 |
-
options = json.loads(options)
|
| 187 |
-
|
| 188 |
-
if self.client_manager is None:
|
| 189 |
-
max_clients = options.get('max_clients', 4)
|
| 190 |
-
max_connection_time = options.get('max_connection_time', 600)
|
| 191 |
-
self.client_manager = ClientManager(max_clients, max_connection_time)
|
| 192 |
-
|
| 193 |
-
if self.client_manager.is_server_full(websocket, options):
|
| 194 |
-
websocket.close()
|
| 195 |
-
return False # Indicates that the connection should not continue
|
| 196 |
-
|
| 197 |
-
if self.backend.is_pywhispercpp():
|
| 198 |
-
self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
|
| 199 |
-
|
| 200 |
-
self.initialize_client(websocket, options)
|
| 201 |
-
|
| 202 |
-
return True
|
| 203 |
-
except json.JSONDecodeError:
|
| 204 |
-
logging.error("Failed to decode JSON from client")
|
| 205 |
-
return False
|
| 206 |
-
except ConnectionClosed:
|
| 207 |
-
logging.info("Connection closed by client")
|
| 208 |
-
return False
|
| 209 |
-
except Exception as e:
|
| 210 |
-
logging.error(f"Error during new connection initialization: {str(e)}")
|
| 211 |
-
return False
|
| 212 |
-
|
| 213 |
-
def process_audio_frames(self, websocket):
|
| 214 |
-
frame_np = self.get_audio_from_websocket(websocket)
|
| 215 |
-
client = self.client_manager.get_client(websocket)
|
| 216 |
-
|
| 217 |
-
# TODO Vad has some problem, it will be blocking process loop
|
| 218 |
-
# if frame_np is False:
|
| 219 |
-
# if self.backend.is_pywhispercpp():
|
| 220 |
-
# client.set_eos(True)
|
| 221 |
-
# return False
|
| 222 |
-
|
| 223 |
-
# if self.backend.is_pywhispercpp():
|
| 224 |
-
# voice_active = self.voice_activity(websocket, frame_np)
|
| 225 |
-
# if voice_active:
|
| 226 |
-
# self.no_voice_activity_chunks = 0
|
| 227 |
-
# client.set_eos(False)
|
| 228 |
-
# if self.use_vad and not voice_active:
|
| 229 |
-
# return True
|
| 230 |
-
|
| 231 |
-
client.add_frames(frame_np)
|
| 232 |
-
return True
|
| 233 |
-
|
| 234 |
-
def recv_audio(self,
|
| 235 |
-
websocket,
|
| 236 |
-
backend: BackendType = BackendType.PYWHISPERCPP):
|
| 237 |
-
|
| 238 |
-
self.backend = backend
|
| 239 |
-
if not self.handle_new_connection(websocket):
|
| 240 |
-
return
|
| 241 |
-
|
| 242 |
-
try:
|
| 243 |
-
while not self.client_manager.is_client_timeout(websocket):
|
| 244 |
-
if not self.process_audio_frames(websocket):
|
| 245 |
-
break
|
| 246 |
-
except ConnectionClosed:
|
| 247 |
-
logging.info("Connection closed by client")
|
| 248 |
-
except Exception as e:
|
| 249 |
-
logging.error(f"Unexpected error: {str(e)}")
|
| 250 |
-
finally:
|
| 251 |
-
if self.client_manager.get_client(websocket):
|
| 252 |
-
self.cleanup(websocket)
|
| 253 |
-
websocket.close()
|
| 254 |
-
del websocket
|
| 255 |
-
|
| 256 |
-
def run(self,
|
| 257 |
-
host,
|
| 258 |
-
port=9090,
|
| 259 |
-
backend="pywhispercpp"):
|
| 260 |
-
"""
|
| 261 |
-
Run the transcription server.
|
| 262 |
-
|
| 263 |
-
Args:
|
| 264 |
-
host (str): The host address to bind the server.
|
| 265 |
-
port (int): The port number to bind the server.
|
| 266 |
-
"""
|
| 267 |
-
|
| 268 |
-
if not BackendType.is_valid(backend):
|
| 269 |
-
raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
|
| 270 |
-
|
| 271 |
-
with serve(
|
| 272 |
-
functools.partial(
|
| 273 |
-
self.recv_audio,
|
| 274 |
-
backend=BackendType(backend),
|
| 275 |
-
),
|
| 276 |
-
host,
|
| 277 |
-
port
|
| 278 |
-
) as server:
|
| 279 |
-
server.serve_forever()
|
| 280 |
-
|
| 281 |
-
def voice_activity(self, websocket, frame_np):
|
| 282 |
-
"""
|
| 283 |
-
Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
|
| 284 |
-
|
| 285 |
-
This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
|
| 286 |
-
contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
|
| 287 |
-
it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
|
| 288 |
-
speech detection to improve subsequent processing steps.
|
| 289 |
-
|
| 290 |
-
Args:
|
| 291 |
-
websocket: The websocket associated with the current client. Used to retrieve the client object
|
| 292 |
-
from the client manager for state management.
|
| 293 |
-
frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
|
| 294 |
-
the audio data for the current frame.
|
| 295 |
-
|
| 296 |
-
Returns:
|
| 297 |
-
bool: True if voice activity is detected in the current frame, False otherwise. When returning False
|
| 298 |
-
after detecting no voice activity for more than three consecutive frames, it also triggers the
|
| 299 |
-
end-of-speech (EOS) flag for the client.
|
| 300 |
-
"""
|
| 301 |
-
if not self.vad_detector(frame_np):
|
| 302 |
-
self.no_voice_activity_chunks += 1
|
| 303 |
-
if self.no_voice_activity_chunks > 3:
|
| 304 |
-
client = self.client_manager.get_client(websocket)
|
| 305 |
-
if not client.eos:
|
| 306 |
-
client.set_eos(True)
|
| 307 |
-
time.sleep(0.1) # Sleep 100m; wait some voice activity.
|
| 308 |
-
return False
|
| 309 |
-
return True
|
| 310 |
-
|
| 311 |
-
def cleanup(self, websocket):
|
| 312 |
-
"""
|
| 313 |
-
Cleans up resources associated with a given client's websocket.
|
| 314 |
-
|
| 315 |
-
Args:
|
| 316 |
-
websocket: The websocket associated with the client to be cleaned up.
|
| 317 |
-
"""
|
| 318 |
-
if self.client_manager.get_client(websocket):
|
| 319 |
-
self.client_manager.remove_client(websocket)
|
| 320 |
-
|
| 321 |
-
|
| 322 |
class ServeClientBase(object):
|
| 323 |
RATE = 16000
|
| 324 |
SERVER_READY = "SERVER_READY"
|
|
@@ -442,6 +135,7 @@ class ServeClientBase(object):
|
|
| 442 |
segments = self.transcript.copy()
|
| 443 |
if last_segment is not None:
|
| 444 |
segments = segments + [last_segment]
|
|
|
|
| 445 |
return segments
|
| 446 |
|
| 447 |
def get_audio_chunk_duration(self, input_bytes):
|
|
@@ -549,10 +243,8 @@ class ServeClientWhisperCPP(ServeClientBase):
|
|
| 549 |
"""
|
| 550 |
Instantiates a new model, sets it as the transcriber and does warmup if desired.
|
| 551 |
"""
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
models_dir = f'{here.parent.parent / "moyoyo_asr_models"}'
|
| 555 |
-
self.transcriber = Model(model=model, models_dir=models_dir)
|
| 556 |
if warmup:
|
| 557 |
self.warmup()
|
| 558 |
|
|
@@ -610,8 +302,15 @@ class ServeClientWhisperCPP(ServeClientBase):
|
|
| 610 |
prompt = '以下是简体中文普通话的句子。'
|
| 611 |
else:
|
| 612 |
prompt = 'The following is an English sentence.'
|
| 613 |
-
|
| 614 |
-
segments = self.transcriber.transcribe(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
text = []
|
| 616 |
for segment in segments:
|
| 617 |
content = segment.text
|
|
|
|
| 1 |
+
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import pathlib
|
| 5 |
import threading
|
| 6 |
import time
|
| 7 |
+
import config
|
|
|
|
|
|
|
| 8 |
import librosa
|
| 9 |
import numpy as np
|
| 10 |
import soundfile
|
| 11 |
from pywhispercpp.model import Model
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
logging.basicConfig(level=logging.INFO)
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
class ServeClientBase(object):
|
| 16 |
RATE = 16000
|
| 17 |
SERVER_READY = "SERVER_READY"
|
|
|
|
| 135 |
segments = self.transcript.copy()
|
| 136 |
if last_segment is not None:
|
| 137 |
segments = segments + [last_segment]
|
| 138 |
+
logging.info(f"{segments}")
|
| 139 |
return segments
|
| 140 |
|
| 141 |
def get_audio_chunk_duration(self, input_bytes):
|
|
|
|
| 243 |
"""
|
| 244 |
Instantiates a new model, sets it as the transcriber and does warmup if desired.
|
| 245 |
"""
|
| 246 |
+
|
| 247 |
+
self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
|
|
|
|
|
|
|
| 248 |
if warmup:
|
| 249 |
self.warmup()
|
| 250 |
|
|
|
|
| 302 |
prompt = '以下是简体中文普通话的句子。'
|
| 303 |
else:
|
| 304 |
prompt = 'The following is an English sentence.'
|
| 305 |
+
|
| 306 |
+
segments = self.transcriber.transcribe(
|
| 307 |
+
mel,
|
| 308 |
+
language=self.language,
|
| 309 |
+
initial_prompt=prompt,
|
| 310 |
+
token_timestamps=True,
|
| 311 |
+
# max_len=max_len,
|
| 312 |
+
print_progress=False
|
| 313 |
+
)
|
| 314 |
text = []
|
| 315 |
for segment in segments:
|
| 316 |
content = segment.text
|
transcribe/server/transcription.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
import functools
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import time
|
| 7 |
+
from enum import Enum
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
import numpy as np
|
| 10 |
+
from .base import ServeClientBase, ServeClientWhisperCPP
|
| 11 |
+
from .whispercpp import PyWhiperCppServe
|
| 12 |
+
from ..vad import VoiceActivityDetector
|
| 13 |
+
from websockets.exceptions import ConnectionClosed
|
| 14 |
+
from websockets.sync.server import serve
|
| 15 |
+
|
| 16 |
+
logging.basicConfig(level=logging.INFO)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ClientManager:
|
| 20 |
+
def __init__(self, max_clients=4, max_connection_time=600):
|
| 21 |
+
"""
|
| 22 |
+
Initializes the ClientManager with specified limits on client connections and connection durations.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
|
| 26 |
+
max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
|
| 27 |
+
to 600 seconds (10 minutes).
|
| 28 |
+
"""
|
| 29 |
+
self.clients = {}
|
| 30 |
+
self.start_times = {}
|
| 31 |
+
self.max_clients = max_clients
|
| 32 |
+
self.max_connection_time = max_connection_time
|
| 33 |
+
|
| 34 |
+
def add_client(self, websocket, client):
|
| 35 |
+
"""
|
| 36 |
+
Adds a client and their connection start time to the tracking dictionaries.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
websocket: The websocket associated with the client to add.
|
| 40 |
+
client: The client object to be added and tracked.
|
| 41 |
+
"""
|
| 42 |
+
self.clients[websocket] = client
|
| 43 |
+
self.start_times[websocket] = time.time()
|
| 44 |
+
|
| 45 |
+
def get_client(self, websocket):
|
| 46 |
+
"""
|
| 47 |
+
Retrieves a client associated with the given websocket.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
websocket: The websocket associated with the client to retrieve.
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
The client object if found, False otherwise.
|
| 54 |
+
"""
|
| 55 |
+
if websocket in self.clients:
|
| 56 |
+
return self.clients[websocket]
|
| 57 |
+
return False
|
| 58 |
+
|
| 59 |
+
def remove_client(self, websocket):
|
| 60 |
+
"""
|
| 61 |
+
Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
|
| 62 |
+
client if necessary.
|
| 63 |
+
|
| 64 |
+
Args:
|
| 65 |
+
websocket: The websocket associated with the client to be removed.
|
| 66 |
+
"""
|
| 67 |
+
client = self.clients.pop(websocket, None)
|
| 68 |
+
if client:
|
| 69 |
+
client.cleanup()
|
| 70 |
+
self.start_times.pop(websocket, None)
|
| 71 |
+
|
| 72 |
+
def get_wait_time(self):
|
| 73 |
+
"""
|
| 74 |
+
Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
|
| 75 |
+
|
| 76 |
+
Returns:
|
| 77 |
+
The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
|
| 78 |
+
"""
|
| 79 |
+
wait_time = None
|
| 80 |
+
for start_time in self.start_times.values():
|
| 81 |
+
current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
|
| 82 |
+
if wait_time is None or current_client_time_remaining < wait_time:
|
| 83 |
+
wait_time = current_client_time_remaining
|
| 84 |
+
return wait_time / 60 if wait_time is not None else 0
|
| 85 |
+
|
| 86 |
+
def is_server_full(self, websocket, options):
|
| 87 |
+
"""
|
| 88 |
+
Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
|
| 89 |
+
|
| 90 |
+
Args:
|
| 91 |
+
websocket: The websocket of the client attempting to connect.
|
| 92 |
+
options: A dictionary of options that may include the client's unique identifier.
|
| 93 |
+
|
| 94 |
+
Returns:
|
| 95 |
+
True if the server is full, False otherwise.
|
| 96 |
+
"""
|
| 97 |
+
if len(self.clients) >= self.max_clients:
|
| 98 |
+
wait_time = self.get_wait_time()
|
| 99 |
+
response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
|
| 100 |
+
websocket.send(json.dumps(response))
|
| 101 |
+
return True
|
| 102 |
+
return False
|
| 103 |
+
|
| 104 |
+
def is_client_timeout(self, websocket):
|
| 105 |
+
"""
|
| 106 |
+
Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
websocket: The websocket associated with the client to check.
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
True if the client's connection time has exceeded the maximum limit, False otherwise.
|
| 113 |
+
"""
|
| 114 |
+
elapsed_time = time.time() - self.start_times[websocket]
|
| 115 |
+
if elapsed_time >= self.max_connection_time:
|
| 116 |
+
self.clients[websocket].disconnect()
|
| 117 |
+
logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
|
| 118 |
+
return True
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class BackendType(Enum):
    """Closed set of transcription backends the server knows how to run."""

    PYWHISPERCPP = "pywhispercpp"

    @staticmethod
    def valid_types() -> List[str]:
        """Return the string value of every declared backend, in declaration order."""
        values: List[str] = []
        for member in BackendType:
            values.append(member.value)
        return values

    @staticmethod
    def is_valid(backend: str) -> bool:
        """Return True when ``backend`` is the value of one of the declared backends."""
        known_values = BackendType.valid_types()
        return backend in known_values

    def is_pywhispercpp(self) -> bool:
        """Return True when this member is the pywhispercpp backend."""
        return self is BackendType.PYWHISPERCPP
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class TranscriptionServer:
    """
    Accept websocket clients, read their audio frames and pass them to a transcription backend.

    One ``recv_audio`` call serves one websocket connection. Per-connection
    client objects are tracked by a ``ClientManager`` that is created lazily
    from the options sent by the first connecting client.
    """

    RATE = 16000  # frame rate handed to the voice activity detector

    def __init__(self):
        # Created on the first connection, using that client's options.
        self.client_manager = None
        # Consecutive frames for which the VAD reported no voice.
        self.no_voice_activity_chunks = 0
        self.single_model = False

    def initialize_client(self, websocket, options):
        """
        Build the backend-specific client for a connection and register it.

        Args:
            websocket: The websocket of the connecting client.
            options: Decoded options sent by the client; ``"language"`` and
                ``"uid"`` are read for the pywhispercpp backend.

        Raises:
            ValueError: If ``self.backend`` is not handled here.
        """
        client: Optional[ServeClientBase] = None

        if self.backend.is_pywhispercpp():
            client = PyWhiperCppServe(
                websocket,
                language=options["language"],
                client_uid=options["uid"],
            )
            logging.info("Running pywhispercpp backend.")

        if client is None:
            raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")

        self.client_manager.add_client(websocket, client)

    def get_audio_from_websocket(self, websocket):
        """
        Receive one audio buffer from the websocket and view it as a numpy array.

        Args:
            websocket: The websocket to receive audio from.

        Returns:
            A float32 numpy array over the received bytes, or ``False`` when
            the client sent the ``b"END_OF_AUDIO"`` sentinel.
        """
        frame_data = websocket.recv()
        if frame_data == b"END_OF_AUDIO":
            return False
        return np.frombuffer(frame_data, dtype=np.float32)

    def handle_new_connection(self, websocket):
        """
        Perform the handshake for a new connection.

        Reads the JSON options message, lazily creates the client manager,
        rejects the client when the server is full, and otherwise creates and
        registers the backend client.

        Args:
            websocket: The websocket of the connecting client.

        Returns:
            True if the client was initialised and audio can be received,
            False if the connection should not continue.
        """
        try:
            logging.info("New client connected")
            options = websocket.recv()
            options = json.loads(options)

            if self.client_manager is None:
                # Limits are taken from the first client's options.
                max_clients = options.get('max_clients', 4)
                max_connection_time = options.get('max_connection_time', 600)
                self.client_manager = ClientManager(max_clients, max_connection_time)

            if self.client_manager.is_server_full(websocket, options):
                websocket.close()
                return False  # Indicates that the connection should not continue

            if self.backend.is_pywhispercpp():
                self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)

            self.initialize_client(websocket, options)

            return True
        except json.JSONDecodeError:
            logging.error("Failed to decode JSON from client")
            return False
        except ConnectionClosed:
            logging.info("Connection closed by client")
            return False
        except Exception as e:
            logging.error(f"Error during new connection initialization: {str(e)}")
            return False

    def process_audio_frames(self, websocket):
        """
        Receive one audio frame and append it to the client's buffer.

        Args:
            websocket: The websocket of the client being served.

        Returns:
            True to keep receiving, False once the client signalled END_OF_AUDIO.
        """
        frame_np = self.get_audio_from_websocket(websocket)
        if frame_np is False:
            # END_OF_AUDIO sentinel: it is not audio data, so it must never
            # reach add_frames(); stop the receive loop instead.
            return False

        client = self.client_manager.get_client(websocket)

        # TODO: VAD gating (voice_activity / set_eos) is disabled here because
        # it was blocking the processing loop.
        client.add_frames(frame_np)
        return True

    def recv_audio(self,
                   websocket,
                   backend: BackendType = BackendType.PYWHISPERCPP):
        """
        Serve one websocket connection until it ends, times out or fails.

        Args:
            websocket: The websocket of the connected client.
            backend: The backend used to transcribe this connection's audio.
        """
        self.backend = backend
        if not self.handle_new_connection(websocket):
            return

        try:
            while not self.client_manager.is_client_timeout(websocket):
                if not self.process_audio_frames(websocket):
                    break
        except ConnectionClosed:
            logging.info("Connection closed by client")
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
        finally:
            # Only registered clients need cleanup and an explicit close.
            if self.client_manager.get_client(websocket):
                self.cleanup(websocket)
                websocket.close()

    def run(self,
            host,
            port=9090,
            backend="pywhispercpp"):
        """
        Run the transcription server.

        Args:
            host (str): The host address to bind the server.
            port (int): The port number to bind the server.
            backend (str): Name of the backend; must be one of ``BackendType.valid_types()``.

        Raises:
            ValueError: If ``backend`` is not a valid backend name.
        """
        if not BackendType.is_valid(backend):
            raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")

        with serve(
            functools.partial(
                self.recv_audio,
                backend=BackendType(backend),
            ),
            host,
            port
        ) as server:
            server.serve_forever()

    def voice_activity(self, websocket, frame_np):
        """
        Evaluate voice activity in an audio frame and track consecutive silent frames.

        When the detector reports no voice, the silent-frame counter is
        incremented. Once it exceeds three, the client's end-of-speech (EOS)
        flag is set if it is not set already, and the call sleeps for 100 ms.

        Args:
            websocket: The websocket of the current client, used to look the
                client up in the client manager.
            frame_np (numpy.ndarray): The audio frame to analyse.

        Returns:
            bool: True if voice activity is detected in the frame, False otherwise.
        """
        if not self.vad_detector(frame_np):
            self.no_voice_activity_chunks += 1
            if self.no_voice_activity_chunks > 3:
                client = self.client_manager.get_client(websocket)
                if not client.eos:
                    client.set_eos(True)
                time.sleep(0.1)  # Sleep 100 ms; wait for some voice activity.
            return False
        return True

    def cleanup(self, websocket):
        """
        Clean up resources associated with a given client's websocket.

        Args:
            websocket: The websocket associated with the client to be cleaned up.
        """
        if self.client_manager.get_client(websocket):
            self.client_manager.remove_client(websocket)
|
| 316 |
+
|
transcribe/server/whispercpp.py
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from .base import ServeClientBase
|
| 3 |
+
from pywhispercpp.model import Model
|
| 4 |
+
import soundfile
|
| 5 |
+
from concurrent.futures import ProcessPoolExecutor as Pool
|
| 6 |
+
import numpy as np
|
| 7 |
+
from logging import getLogger
|
| 8 |
+
from difflib import SequenceMatcher
|
| 9 |
+
import collections
|
| 10 |
+
import config
|
| 11 |
+
import time
|
| 12 |
+
import json
|
| 13 |
+
import threading
|
| 14 |
+
|
| 15 |
+
logger = getLogger("Pywhisper")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TripleTextBuffer:
    """Bounded history of (text, index) observations used to detect when a text has stabilised."""

    def __init__(self, size=2):
        # Oldest entries are dropped automatically once `size` is exceeded.
        self.history = collections.deque(maxlen=size)

    def _clean(self):
        """Forget every stored observation."""
        self.history.clear()

    def add_entry(self, text, index):
        """
        Record one observation.

        Args:
            text: The observed text.
            index: Relative array index into the current buffer that belongs to ``text``.
        """
        observation = (text, index)
        self.history.append(observation)

    def get_final_index(self, similarity_threshold=0.7):
        """
        Return a buffer index considered reliable, based on how much the text changed.

        The first two stored texts are compared. When their similarity ratio
        reaches ``similarity_threshold`` the history is cleared and the index
        stored with the second entry is returned.

        Returns:
            The second entry's index, or None when fewer than two entries are
            stored or the texts differ too much.
        """
        if len(self.history) < 2:
            return None

        (earlier_text, _), (later_text, later_index) = self.history[0], self.history[1]

        # How similar the two observations are.
        ratio = self.text_similarity(earlier_text, later_text)
        is_stable = ratio >= similarity_threshold
        if not is_stable:
            return None

        self._clean()
        return later_index

    @staticmethod
    def text_similarity(text1, text2):
        """Return the ``difflib.SequenceMatcher`` ratio of the two texts."""
        matcher = SequenceMatcher(None, text1, text2)
        return matcher.ratio()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class SegmentManager:
    """Accumulate transcribed text as a pending string, committed short sentences and committed segments."""

    def __init__(self) -> None:
        self._commited_segments = []  # confirmed segments (full sentences)
        self._commited_short_sentences = []  # confirmed short sentences not yet merged
        self._temp_string = ""  # current provisional text, held until it is committed

    def handle(self, string):
        """Replace the provisional text with ``string`` and return ``self`` for chaining."""
        self._temp_string = string
        return self

    @property
    def short_sentence(self) -> str:
        """Concatenation of the committed short sentences not yet merged into a segment."""
        pending = self._commited_short_sentences
        return "".join(pending)

    @property
    def segment(self):
        """The most recently committed segment, or an empty string when there is none."""
        if len(self._commited_segments) > 0:
            return self._commited_segments[-1]
        return ""

    def get_seg_id(self):
        """Return the number of committed segments."""
        return len(self._commited_segments)

    @property
    def string(self):
        """The current provisional text."""
        return self._temp_string

    def commit_short_sentence(self):
        """Move the provisional text into the list of short sentences and reset it."""
        finished = self._temp_string
        self._commited_short_sentences.append(finished)
        self._temp_string = ""

    def commit_segment(self):
        """Merge the pending short sentences into one segment and start a fresh list."""
        merged = self.short_sentence
        self._commited_segments.append(merged)
        self._commited_short_sentences = []

    def commit(self, is_end_sentence=False):
        """
        Commit the provisional text as a short sentence.

        Used when part of the audio is about to be cut away. When
        ``is_end_sentence`` is true the accumulated short sentences are
        additionally committed as a complete segment.
        """
        self.commit_short_sentence()
        if not is_end_sentence:
            return
        self.commit_segment()
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class PywhisperInference:
    """
    Holder for a class-level pywhispercpp ``Model`` plus helpers to run it.

    ``initializer`` is passed as the ``initializer`` of the worker pool created
    by ``PyWhiperCppServe``, so the model is loaded in the worker process.
    """

    # Set by initializer(); inference() and warmup() require it to be loaded.
    model = None

    @classmethod
    def initializer(cls, warmup=True):
        """
        Load the model named by ``config.WHISPER_MODEL`` from ``config.MODEL_DIR``.

        Args:
            warmup: When true, run ``warmup()`` once after loading.
        """
        models_dir = config.MODEL_DIR.as_posix()
        cls.model = Model(
            model=config.WHISPER_MODEL,
            models_dir=models_dir,
            print_realtime=False,
            print_progress=False,
            print_timestamps=False,
        )
        if warmup:
            cls.warmup()

    @classmethod
    def warmup(cls, warmup_steps=1):
        """
        Transcribe a bundled sample ``warmup_steps`` times.

        The sample is read from ``assets/jfk.flac``, a path relative to the
        current working directory.
        """
        mel, _, = soundfile.read("assets/jfk.flac")
        for _ in range(warmup_steps):
            cls.model.transcribe(mel, print_progress=False)

    @staticmethod
    def config_language(language):
        """
        Return the ``(max_len, initial_prompt)`` pair configured for a language.

        Args:
            language: ``"zh"`` or ``"en"``.

        Raises:
            ValueError: If ``language`` is not supported.
        """
        if language == "zh":
            return config.MAX_LENTH_ZH, config.WHISPER_PROMPT_ZH
        if language == "en":
            # English must use its own settings; previously the Chinese
            # max length and prompt were returned here.
            return config.MAX_LENGTH_EN, config.WHISPER_PROMPT_EN
        raise ValueError(f"Unsupported language : {language}")

    @classmethod
    def inference(cls, audio_buffer, language):
        """
        Transcribe raw float32 audio bytes with the loaded model.

        Args:
            audio_buffer: Bytes that are reinterpreted as a float32 array.
            language: Language code, forwarded to the model and used to pick
                the prompt and ``max_len``.

        Returns:
            Whatever ``Model.transcribe`` returns (callers iterate it and read
            ``.text`` / ``.t1`` on each item).
        """
        max_len, prompt = cls.config_language(language)
        audio_buffer = np.frombuffer(audio_buffer, dtype=np.float32)
        return cls.model.transcribe(
            audio_buffer,
            initial_prompt=prompt,
            language=language,
            token_timestamps=True,
            max_len=max_len
        )
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class PyWhiperCppServe(ServeClientBase):
    """
    Per-connection client that buffers incoming audio and transcribes it in a background thread.

    Inference itself runs in a single-worker process pool; the thread keeps
    re-transcribing the current buffer and trims it once a leading part of the
    text is considered stable.
    """

    def __init__(self, websocket, language=None, client_uid=None,):
        super().__init__(client_uid, websocket)
        self.language = language
        # Watch buffer: compares the text of consecutive runs to decide
        # whether the output has stopped changing.
        self._text_buffer = TripleTextBuffer()
        # Stores the transcription results.
        self._segment_manager = SegmentManager()
        self.lock = threading.Lock()
        self.frames_np = None
        self.sample_rate = 16000

        self._pool = Pool(
            max_workers=1, initializer=PywhisperInference.initializer)

        logger.info('Create a thread to process audio.')
        worker = threading.Thread(target=self.speech_to_text)
        self.trans_thread = worker
        self.trans_thread.start()

        ready_message = {
            "uid": self.client_uid,
            "message": self.SERVER_READY,
            "backend": "pywhispercpp"
        }
        self.websocket.send(json.dumps(ready_message))

    def add_frames(self, frame_np):
        """Append ``frame_np`` to the audio buffer (the first frame is stored as a copy)."""
        with self.lock:
            if self.frames_np is not None:
                self.frames_np = np.append(self.frames_np, frame_np)
            else:
                self.frames_np = frame_np.copy()

    def update_audio_buffer(self, last_offset):
        """Drop the first ``last_offset`` samples from the audio buffer."""
        with self.lock:
            remaining = self.frames_np[last_offset:]
            self.frames_np = remaining

    def transcribe_audio(self, audio_buffer):
        """
        Transcribe an audio chunk in the worker pool and wait for the result.

        Args:
            audio_buffer (np.array): The audio chunk to transcribe.

        Returns:
            The result of ``PywhisperInference.inference`` for this chunk.
        """
        raw_audio = audio_buffer.tobytes()
        future = self._pool.submit(
            PywhisperInference.inference, raw_audio, self.language)
        return future.result()

    def _segments_split(self, segments, audio_buffer: np.ndarray):
        """
        Split the segments at a punctuation marker into a watched part and the remainder.

        Returns:
            ``(cut_index, left_segments, right_segments, is_end)`` where
            ``cut_index`` is 0 when no usable marker was found.
        """
        left_watch_sequences = []
        left_watch_idx = 0
        right_watch_sequences = []

        buffer_seconds = len(audio_buffer) / self.sample_rate
        if buffer_seconds < 10:
            # Under 10 s: split on pause markers such as commas.
            is_end = False
            markers = config.PAUSE_END_MARKERS
        else:
            # Otherwise split on sentence-ending markers (period, question mark, ...).
            is_end = True
            markers = config.SENTENCE_END_MARKERS

        for position, seg in enumerate(segments):
            left_watch_sequences.append(seg)
            if seg.text not in markers:
                continue

            # presumably seg.t1 is in 1/100 s units, converted to a sample index -- TODO confirm
            seg_index = int(seg.t1 / 100 * self.sample_rate)
            rest_buffer_duration = (len(audio_buffer) - seg_index) / self.sample_rate
            right_watch_sequences = segments[position + 1:]
            # Only cut when at least 1.5 s of audio remains after the marker.
            if rest_buffer_duration >= 1.5:
                left_watch_idx = seg_index
                break

        return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end

    def analysis_segments(self, segments, audio_buffer: np.ndarray):
        """
        Decide whether the leading part of the transcription can be cut from the buffer.

        The first punctuation marker serves as the anchor: the text to its left
        is the part awaiting confirmation, the text to its right is watched
        only after the left part has been confirmed. Once confirmed, the
        matching audio can be removed to shrink the next inference input.

        Returns:
            ``(cut_index_or_None, left_text, right_text, is_end_sentence)``.
        """
        split_idx, left_parts, right_parts, is_end_sentence = self._segments_split(segments, audio_buffer)
        left_watch_string = "".join(part.text for part in left_parts)
        right_watch_string = "".join(part.text for part in right_parts)

        confirmed_cut = None
        if split_idx != 0:
            # Remember the watched text for comparison with the next run.
            self._text_buffer.add_entry(left_watch_string, split_idx)
            stable_index = self._text_buffer.get_final_index()
            if stable_index:
                confirmed_cut = stable_index
        return confirmed_cut, left_watch_string, right_watch_string, is_end_sentence

    def speech_to_text(self):
        """Thread body: repeatedly transcribe the buffered audio until ``self.exit`` is set."""
        while not self.exit:
            if self.frames_np is None:
                time.sleep(0.02)  # wait for any audio to arrive
                continue

            audio_buffer = self.get_audio_chunk_for_processing()
            try:
                logger.info(f"[pywhispercpp:] Processing audio with duration: {len(audio_buffer)}")
                segments = self.transcribe_audio(audio_buffer)
            except KeyboardInterrupt:
                return
            except Exception as e:
                logger.error(f"[ERROR]: {e}")
                continue

            self.handle_transcription_output(segments, audio_buffer)

        logger.info("Exiting speech to text thread")
        self._pool.shutdown(wait=False, cancel_futures=True)

    def handle_transcription_output(self, segments, audio_buffer):
        """Update the segment manager from a transcription result and print the current text."""
        manager = self._segment_manager
        manager.handle("".join(seg.text for seg in segments))

        # Analyse the sentence.
        last_cut_index, left_string, right_string, is_end_sentence = self.analysis_segments(segments, audio_buffer)

        if not last_cut_index:
            print(manager.get_seg_id(), manager.short_sentence + manager.string)
            return

        self.update_audio_buffer(last_cut_index)
        # Commit the sentence or short sentence.
        manager.handle(left_string).commit(is_end_sentence)
        manager.handle(right_string)

        if is_end_sentence:
            finished_id = manager.get_seg_id() - 1
            print(finished_id, manager.segment,)
            print(finished_id + 1, manager.string,)
        else:
            print(manager.get_seg_id(), manager.short_sentence + manager.string)

    def send_to_client(self, data_dict):
        """Send ``data_dict`` merged with this client's uid as JSON; sending errors are logged."""
        content = {"uid": self.client_uid}
        content.update(data_dict)
        try:
            serialized = json.dumps(content)
            self.websocket.send(serialized)
        except Exception as e:
            logger.error(f"[ERROR]: Sending data to client: {e}")

    def get_audio_chunk_for_processing(self):
        """
        Return a copy of the buffered audio, front-padded with silence when shorter than one second.
        """
        one_second = self.sample_rate * 1
        if self.frames_np.shape[0] < one_second:
            # Number of samples missing to reach one second.
            missing = one_second - len(self.frames_np)
            # Silence (zeros), with an extra 10 ms on top of the missing part.
            lead_in = np.zeros(missing + int(0.01 * self.sample_rate), dtype=np.float32)
            # Silence goes in front of the original audio.
            combined = np.concatenate([lead_in, self.frames_np])
            return combined.copy()
        return self.frames_np.copy()

    def cleanup(self):
        """Shut down the worker pool, then run the base-class cleanup."""
        logger.info("start shut down worker pool.")
        self._pool.shutdown(wait=False, cancel_futures=True)
        logger.info("shut down worker pool success.")
        return super().cleanup()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|