daihui.zhang committed on
Commit 586518f
1 Parent(s): a257a82

init whispercpp transcription
.gitignore CHANGED
@@ -1,7 +1,6 @@
  __pycache__/
  *.py[cod]
  *$py.class
-
  # C extensions
  *.so
 
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
config.py ADDED
@@ -0,0 +1,19 @@
+ import pathlib
+
+
+ BASE_DIR = pathlib.Path(__file__).parent
+ MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
+ # Punctuation marks
+ SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', ':']
+ PAUSE_END_MARKERS = [',', ',', '、']
+
+ # Whisper inference parameters
+ WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
+ MAX_LENGTH_ZH = 3
+
+ WHISPER_PROMPT_EN = "The following is an English sentence."
+ MAX_LENGTH_EN = 3
+
+ WHISPER_MODEL = 'medium-q5_0'
+
+
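For orientation (not part of the commit itself): a minimal sketch of how these constants feed pywhispercpp model loading, mirroring the Model(...) calls introduced in transcribe/server/base.py and transcribe/server/whispercpp.py below.

    # Sketch only, assuming the model file exists under MODEL_DIR.
    from pywhispercpp.model import Model
    import config

    model = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR.as_posix())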
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [project]
+ name = "trans"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "av>=14.2.0",
+     "librosa>=0.11.0",
+     "numpy>=2.1.3",
+     "onnxruntime>=1.21.0",
+     "pyaudio>=0.2.14",
+     "setuptools>=78.1.0",
+     "soundfile>=0.13.1",
+     "torch>=2.6.0",
+     "tqdm>=4.67.1",
+     "websocket-client>=1.8.0",
+     "websockets>=15.0.1",
+ ]
requirements.txt ADDED
@@ -0,0 +1,114 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile pyproject.toml -o requirements.txt
+ audioread==3.0.1
+     # via librosa
+ av==14.3.0
+     # via trans (pyproject.toml)
+ certifi==2025.1.31
+     # via requests
+ cffi==1.17.1
+     # via soundfile
+ charset-normalizer==3.4.1
+     # via requests
+ coloredlogs==15.0.1
+     # via onnxruntime
+ decorator==5.2.1
+     # via librosa
+ filelock==3.18.0
+     # via torch
+ flatbuffers==25.2.10
+     # via onnxruntime
+ fsspec==2025.3.2
+     # via torch
+ humanfriendly==10.0
+     # via coloredlogs
+ idna==3.10
+     # via requests
+ jinja2==3.1.6
+     # via torch
+ joblib==1.4.2
+     # via
+     #   librosa
+     #   scikit-learn
+ lazy-loader==0.4
+     # via librosa
+ librosa==0.11.0
+     # via trans (pyproject.toml)
+ llvmlite==0.44.0
+     # via numba
+ markupsafe==3.0.2
+     # via jinja2
+ mpmath==1.3.0
+     # via sympy
+ msgpack==1.1.0
+     # via librosa
+ networkx==3.4.2
+     # via torch
+ numba==0.61.0
+     # via librosa
+ numpy==2.1.3
+     # via
+     #   trans (pyproject.toml)
+     #   librosa
+     #   numba
+     #   onnxruntime
+     #   scikit-learn
+     #   scipy
+     #   soundfile
+     #   soxr
+ onnxruntime==1.21.0
+     # via trans (pyproject.toml)
+ packaging==24.2
+     # via
+     #   lazy-loader
+     #   onnxruntime
+     #   pooch
+ platformdirs==4.3.7
+     # via pooch
+ pooch==1.8.2
+     # via librosa
+ protobuf==6.30.2
+     # via onnxruntime
+ pyaudio==0.2.14
+     # via trans (pyproject.toml)
+ pycparser==2.22
+     # via cffi
+ requests==2.32.3
+     # via pooch
+ scikit-learn==1.6.1
+     # via librosa
+ scipy==1.15.2
+     # via
+     #   librosa
+     #   scikit-learn
+ setuptools==78.1.0
+     # via trans (pyproject.toml)
+ soundfile==0.13.1
+     # via
+     #   trans (pyproject.toml)
+     #   librosa
+ soxr==0.5.0.post1
+     # via librosa
+ sympy==1.13.1
+     # via
+     #   onnxruntime
+     #   torch
+ threadpoolctl==3.6.0
+     # via scikit-learn
+ torch==2.6.0
+     # via trans (pyproject.toml)
+ tqdm==4.67.1
+     # via trans (pyproject.toml)
+ typing-extensions==4.13.1
+     # via
+     #   librosa
+     #   torch
+ urllib3==2.3.0
+     # via requests
+ websocket-client==1.8.0
+     # via trans (pyproject.toml)
+ websockets==15.0.1
+     # via trans (pyproject.toml)
+
+ llama-cpp-python
+ pywhispercpp
run_client.py CHANGED
@@ -2,7 +2,7 @@ from transcribe.client import TranscriptionClient
 
  client = TranscriptionClient(
      "localhost",
-     9000,
+     9090,
      lang="zh",
      save_output_recording=False,  # Only used for microphone input, False by default
      output_recording_filename="./output_recording.wav",  # Only used for microphone input
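The client now targets port 9090, matching the default of TranscriptionServer.run(). A hypothetical server-side entry point (no run_server.py is part of this commit) would look like:

    # Sketch only; uses the run() signature defined in transcribe/server/transcription.py.
    from transcribe.server import TranscriptionServer

    if __name__ == "__main__":
        TranscriptionServer().run("localhost", port=9090, backend="pywhispercpp")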
transcribe/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (183 Bytes)
 
transcribe/__pycache__/client.cpython-311.pyc DELETED
Binary file (39 kB)
 
transcribe/__pycache__/server.cpython-311.pyc DELETED
Binary file (36 kB)
 
transcribe/__pycache__/utils.cpython-311.pyc DELETED
Binary file (4.64 kB)
 
transcribe/__pycache__/vad.cpython-311.pyc DELETED
Binary file (9.36 kB)
 
transcribe/server/__init__.py ADDED
@@ -0,0 +1,2 @@
+
+ from .transcription import TranscriptionServer
transcribe/{server.py → server/base.py} RENAMED
@@ -1,324 +1,17 @@
- import functools
+
  import json
  import logging
  import pathlib
  import threading
  import time
- from enum import Enum
- from typing import List, Optional
-
+ import config
  import librosa
  import numpy as np
  import soundfile
  from pywhispercpp.model import Model
- from websockets.exceptions import ConnectionClosed
- from websockets.sync.server import serve
-
- from transcribe.vad import VoiceActivityDetector
 
  logging.basicConfig(level=logging.INFO)
 
-
- class ClientManager:
-     def __init__(self, max_clients=4, max_connection_time=600):
-         """
-         Initializes the ClientManager with specified limits on client connections and connection durations.
-
-         Args:
-             max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
-             max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
-                 to 600 seconds (10 minutes).
-         """
-         self.clients = {}
-         self.start_times = {}
-         self.max_clients = max_clients
-         self.max_connection_time = max_connection_time
-
-     def add_client(self, websocket, client):
-         """
-         Adds a client and their connection start time to the tracking dictionaries.
-
-         Args:
-             websocket: The websocket associated with the client to add.
-             client: The client object to be added and tracked.
-         """
-         self.clients[websocket] = client
-         self.start_times[websocket] = time.time()
-
-     def get_client(self, websocket):
-         """
-         Retrieves a client associated with the given websocket.
-
-         Args:
-             websocket: The websocket associated with the client to retrieve.
-
-         Returns:
-             The client object if found, False otherwise.
-         """
-         if websocket in self.clients:
-             return self.clients[websocket]
-         return False
-
-     def remove_client(self, websocket):
-         """
-         Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
-         client if necessary.
-
-         Args:
-             websocket: The websocket associated with the client to be removed.
-         """
-         client = self.clients.pop(websocket, None)
-         if client:
-             client.cleanup()
-         self.start_times.pop(websocket, None)
-
-     def get_wait_time(self):
-         """
-         Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
-
-         Returns:
-             The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
-         """
-         wait_time = None
-         for start_time in self.start_times.values():
-             current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
-             if wait_time is None or current_client_time_remaining < wait_time:
-                 wait_time = current_client_time_remaining
-         return wait_time / 60 if wait_time is not None else 0
-
-     def is_server_full(self, websocket, options):
-         """
-         Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
-
-         Args:
-             websocket: The websocket of the client attempting to connect.
-             options: A dictionary of options that may include the client's unique identifier.
-
-         Returns:
-             True if the server is full, False otherwise.
-         """
-         if len(self.clients) >= self.max_clients:
-             wait_time = self.get_wait_time()
-             response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
-             websocket.send(json.dumps(response))
-             return True
-         return False
-
-     def is_client_timeout(self, websocket):
-         """
-         Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
-
-         Args:
-             websocket: The websocket associated with the client to check.
-
-         Returns:
-             True if the client's connection time has exceeded the maximum limit, False otherwise.
-         """
-         elapsed_time = time.time() - self.start_times[websocket]
-         if elapsed_time >= self.max_connection_time:
-             self.clients[websocket].disconnect()
-             logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
-             return True
-         return False
-
-
- class BackendType(Enum):
-     PYWHISPERCPP = "pywhispercpp"
-
-     @staticmethod
-     def valid_types() -> List[str]:
-         return [backend_type.value for backend_type in BackendType]
-
-     @staticmethod
-     def is_valid(backend: str) -> bool:
-         return backend in BackendType.valid_types()
-
-     def is_pywhispercpp(self) -> bool:
-         return self == BackendType.PYWHISPERCPP
-
-
- class TranscriptionServer:
-     RATE = 16000
-
-     def __init__(self):
-         self.client_manager = None
-         self.no_voice_activity_chunks = 0
-         self.single_model = False
-
-     def initialize_client(
-         self, websocket, options
-     ):
-         client: Optional[ServeClientBase] = None
-
-         if self.backend.is_pywhispercpp():
-             client = ServeClientWhisperCPP(
-                 websocket,
-                 language=options["language"],
-                 client_uid=options["uid"],
-                 single_model=self.single_model,
-             )
-             logging.info("Running pywhispercpp backend.")
-
-         if client is None:
-             raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
-
-         self.client_manager.add_client(websocket, client)
-
-     def get_audio_from_websocket(self, websocket):
-         """
-         Receives audio buffer from websocket and creates a numpy array out of it.
-
-         Args:
-             websocket: The websocket to receive audio from.
-
-         Returns:
-             A numpy array containing the audio.
-         """
-         frame_data = websocket.recv()
-         if frame_data == b"END_OF_AUDIO":
-             return False
-         return np.frombuffer(frame_data, dtype=np.float32)
-
-     def handle_new_connection(self, websocket):
-         try:
-             logging.info("New client connected")
-             options = websocket.recv()
-             options = json.loads(options)
-
-             if self.client_manager is None:
-                 max_clients = options.get('max_clients', 4)
-                 max_connection_time = options.get('max_connection_time', 600)
-                 self.client_manager = ClientManager(max_clients, max_connection_time)
-
-             if self.client_manager.is_server_full(websocket, options):
-                 websocket.close()
-                 return False  # Indicates that the connection should not continue
-
-             if self.backend.is_pywhispercpp():
-                 self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
-
-             self.initialize_client(websocket, options)
-
-             return True
-         except json.JSONDecodeError:
-             logging.error("Failed to decode JSON from client")
-             return False
-         except ConnectionClosed:
-             logging.info("Connection closed by client")
-             return False
-         except Exception as e:
-             logging.error(f"Error during new connection initialization: {str(e)}")
-             return False
-
-     def process_audio_frames(self, websocket):
-         frame_np = self.get_audio_from_websocket(websocket)
-         client = self.client_manager.get_client(websocket)
-
-         # TODO Vad has some problem, it will be blocking process loop
-         # if frame_np is False:
-         #     if self.backend.is_pywhispercpp():
-         #         client.set_eos(True)
-         #     return False
-
-         # if self.backend.is_pywhispercpp():
-         #     voice_active = self.voice_activity(websocket, frame_np)
-         #     if voice_active:
-         #         self.no_voice_activity_chunks = 0
-         #         client.set_eos(False)
-         #     if self.use_vad and not voice_active:
-         #         return True
-
-         client.add_frames(frame_np)
-         return True
-
-     def recv_audio(self,
-                    websocket,
-                    backend: BackendType = BackendType.PYWHISPERCPP):
-         self.backend = backend
-         if not self.handle_new_connection(websocket):
-             return
-
-         try:
-             while not self.client_manager.is_client_timeout(websocket):
-                 if not self.process_audio_frames(websocket):
-                     break
-         except ConnectionClosed:
-             logging.info("Connection closed by client")
-         except Exception as e:
-             logging.error(f"Unexpected error: {str(e)}")
-         finally:
-             if self.client_manager.get_client(websocket):
-                 self.cleanup(websocket)
-                 websocket.close()
-             del websocket
-
-     def run(self,
-             host,
-             port=9090,
-             backend="pywhispercpp"):
-         """
-         Run the transcription server.
-
-         Args:
-             host (str): The host address to bind the server.
-             port (int): The port number to bind the server.
-         """
-         if not BackendType.is_valid(backend):
-             raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
-
-         with serve(
-             functools.partial(
-                 self.recv_audio,
-                 backend=BackendType(backend),
-             ),
-             host,
-             port
-         ) as server:
-             server.serve_forever()
-
-     def voice_activity(self, websocket, frame_np):
-         """
-         Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
-
-         This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
-         contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
-         it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
-         speech detection to improve subsequent processing steps.
-
-         Args:
-             websocket: The websocket associated with the current client. Used to retrieve the client object
-                 from the client manager for state management.
-             frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
-                 the audio data for the current frame.
-
-         Returns:
-             bool: True if voice activity is detected in the current frame, False otherwise. When returning False
-                 after detecting no voice activity for more than three consecutive frames, it also triggers the
-                 end-of-speech (EOS) flag for the client.
-         """
-         if not self.vad_detector(frame_np):
-             self.no_voice_activity_chunks += 1
-             if self.no_voice_activity_chunks > 3:
-                 client = self.client_manager.get_client(websocket)
-                 if not client.eos:
-                     client.set_eos(True)
-                 time.sleep(0.1)  # Sleep 100m; wait some voice activity.
-                 return False
-         return True
-
-     def cleanup(self, websocket):
-         """
-         Cleans up resources associated with a given client's websocket.
-
-         Args:
-             websocket: The websocket associated with the client to be cleaned up.
-         """
-         if self.client_manager.get_client(websocket):
-             self.client_manager.remove_client(websocket)
-
-
  class ServeClientBase(object):
      RATE = 16000
      SERVER_READY = "SERVER_READY"
@@ -442,6 +135,7 @@ class ServeClientBase(object):
          segments = self.transcript.copy()
          if last_segment is not None:
              segments = segments + [last_segment]
+         logging.info(f"{segments}")
          return segments
 
      def get_audio_chunk_duration(self, input_bytes):
@@ -549,10 +243,8 @@ class ServeClientWhisperCPP(ServeClientBase):
          """
          Instantiates a new model, sets it as the transcriber and does warmup if desired.
          """
-         model = 'medium-q5_0'
-         here = pathlib.Path(__file__)
-         models_dir = f'{here.parent.parent / "moyoyo_asr_models"}'
-         self.transcriber = Model(model=model, models_dir=models_dir)
+
+         self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
          if warmup:
              self.warmup()
@@ -610,8 +302,15 @@
              prompt = '以下是简体中文普通话的句子。'
          else:
              prompt = 'The following is an English sentence.'
-
-         segments = self.transcriber.transcribe(mel, language='zh', initial_prompt=prompt, print_progress=False)
+
+         segments = self.transcriber.transcribe(
+             mel,
+             language=self.language,
+             initial_prompt=prompt,
+             token_timestamps=True,
+             # max_len=max_len,
+             print_progress=False
+         )
          text = []
          for segment in segments:
              content = segment.text
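The new token_timestamps=True argument makes pywhispercpp report per-token times t0/t1, which whisper.cpp expresses in 10 ms ticks; the splitting logic in transcribe/server/whispercpp.py below converts t1 into a sample offset. A worked sketch of that conversion, assuming the 16 kHz rate used throughout:

    SAMPLE_RATE = 16000

    def t1_to_sample(t1: int) -> int:
        # t1 is in 10 ms (centisecond) units, so /100 yields seconds.
        return int(t1 / 100 * SAMPLE_RATE)

    assert t1_to_sample(150) == 24000  # a token ending at 1.5 s maps to sample 24000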
transcribe/server/transcription.py ADDED
@@ -0,0 +1,314 @@
+ import functools
+ import json
+ import logging
+ import time
+ from enum import Enum
+ from typing import List, Optional
+ import numpy as np
+ from .base import ServeClientBase, ServeClientWhisperCPP
+ from .whispercpp import PyWhisperCppServe
+ from ..vad import VoiceActivityDetector
+ from websockets.exceptions import ConnectionClosed
+ from websockets.sync.server import serve
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ class ClientManager:
+     def __init__(self, max_clients=4, max_connection_time=600):
+         """
+         Initializes the ClientManager with specified limits on client connections and connection durations.
+
+         Args:
+             max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
+             max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
+                 to 600 seconds (10 minutes).
+         """
+         self.clients = {}
+         self.start_times = {}
+         self.max_clients = max_clients
+         self.max_connection_time = max_connection_time
+
+     def add_client(self, websocket, client):
+         """
+         Adds a client and their connection start time to the tracking dictionaries.
+
+         Args:
+             websocket: The websocket associated with the client to add.
+             client: The client object to be added and tracked.
+         """
+         self.clients[websocket] = client
+         self.start_times[websocket] = time.time()
+
+     def get_client(self, websocket):
+         """
+         Retrieves a client associated with the given websocket.
+
+         Args:
+             websocket: The websocket associated with the client to retrieve.
+
+         Returns:
+             The client object if found, False otherwise.
+         """
+         if websocket in self.clients:
+             return self.clients[websocket]
+         return False
+
+     def remove_client(self, websocket):
+         """
+         Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
+         client if necessary.
+
+         Args:
+             websocket: The websocket associated with the client to be removed.
+         """
+         client = self.clients.pop(websocket, None)
+         if client:
+             client.cleanup()
+         self.start_times.pop(websocket, None)
+
+     def get_wait_time(self):
+         """
+         Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
+
+         Returns:
+             The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
+         """
+         wait_time = None
+         for start_time in self.start_times.values():
+             current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
+             if wait_time is None or current_client_time_remaining < wait_time:
+                 wait_time = current_client_time_remaining
+         return wait_time / 60 if wait_time is not None else 0
+
+     def is_server_full(self, websocket, options):
+         """
+         Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
+
+         Args:
+             websocket: The websocket of the client attempting to connect.
+             options: A dictionary of options that may include the client's unique identifier.
+
+         Returns:
+             True if the server is full, False otherwise.
+         """
+         if len(self.clients) >= self.max_clients:
+             wait_time = self.get_wait_time()
+             response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
+             websocket.send(json.dumps(response))
+             return True
+         return False
+
+     def is_client_timeout(self, websocket):
+         """
+         Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
+
+         Args:
+             websocket: The websocket associated with the client to check.
+
+         Returns:
+             True if the client's connection time has exceeded the maximum limit, False otherwise.
+         """
+         elapsed_time = time.time() - self.start_times[websocket]
+         if elapsed_time >= self.max_connection_time:
+             self.clients[websocket].disconnect()
+             logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
+             return True
+         return False
+
+
+ class BackendType(Enum):
+     PYWHISPERCPP = "pywhispercpp"
+
+     @staticmethod
+     def valid_types() -> List[str]:
+         return [backend_type.value for backend_type in BackendType]
+
+     @staticmethod
+     def is_valid(backend: str) -> bool:
+         return backend in BackendType.valid_types()
+
+     def is_pywhispercpp(self) -> bool:
+         return self == BackendType.PYWHISPERCPP
+
+
+ class TranscriptionServer:
+     RATE = 16000
+
+     def __init__(self):
+         self.client_manager = None
+         self.no_voice_activity_chunks = 0
+         self.single_model = False
+
+     def initialize_client(
+         self, websocket, options
+     ):
+         client: Optional[ServeClientBase] = None
+
+         if self.backend.is_pywhispercpp():
+             client = PyWhisperCppServe(
+                 websocket,
+                 language=options["language"],
+                 client_uid=options["uid"],
+             )
+             logging.info("Running pywhispercpp backend.")
+
+         if client is None:
+             raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
+
+         self.client_manager.add_client(websocket, client)
+
+     def get_audio_from_websocket(self, websocket):
+         """
+         Receives audio buffer from websocket and creates a numpy array out of it.
+
+         Args:
+             websocket: The websocket to receive audio from.
+
+         Returns:
+             A numpy array containing the audio.
+         """
+         frame_data = websocket.recv()
+         if frame_data == b"END_OF_AUDIO":
+             return False
+         return np.frombuffer(frame_data, dtype=np.float32)
+
+     def handle_new_connection(self, websocket):
+         try:
+             logging.info("New client connected")
+             options = websocket.recv()
+             options = json.loads(options)
+
+             if self.client_manager is None:
+                 max_clients = options.get('max_clients', 4)
+                 max_connection_time = options.get('max_connection_time', 600)
+                 self.client_manager = ClientManager(max_clients, max_connection_time)
+
+             if self.client_manager.is_server_full(websocket, options):
+                 websocket.close()
+                 return False  # Indicates that the connection should not continue
+
+             if self.backend.is_pywhispercpp():
+                 self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
+
+             self.initialize_client(websocket, options)
+
+             return True
+         except json.JSONDecodeError:
+             logging.error("Failed to decode JSON from client")
+             return False
+         except ConnectionClosed:
+             logging.info("Connection closed by client")
+             return False
+         except Exception as e:
+             logging.error(f"Error during new connection initialization: {str(e)}")
+             return False
+
+     def process_audio_frames(self, websocket):
+         frame_np = self.get_audio_from_websocket(websocket)
+         client = self.client_manager.get_client(websocket)
+
+         # TODO: VAD has a problem; it can block the processing loop
+         # if frame_np is False:
+         #     if self.backend.is_pywhispercpp():
+         #         client.set_eos(True)
+         #     return False
+
+         # if self.backend.is_pywhispercpp():
+         #     voice_active = self.voice_activity(websocket, frame_np)
+         #     if voice_active:
+         #         self.no_voice_activity_chunks = 0
+         #         client.set_eos(False)
+         #     if self.use_vad and not voice_active:
+         #         return True
+
+         client.add_frames(frame_np)
+         return True
+
+     def recv_audio(self,
+                    websocket,
+                    backend: BackendType = BackendType.PYWHISPERCPP):
+         self.backend = backend
+         if not self.handle_new_connection(websocket):
+             return
+
+         try:
+             while not self.client_manager.is_client_timeout(websocket):
+                 if not self.process_audio_frames(websocket):
+                     break
+         except ConnectionClosed:
+             logging.info("Connection closed by client")
+         except Exception as e:
+             logging.error(f"Unexpected error: {str(e)}")
+         finally:
+             if self.client_manager.get_client(websocket):
+                 self.cleanup(websocket)
+                 websocket.close()
+             del websocket
+
+     def run(self,
+             host,
+             port=9090,
+             backend="pywhispercpp"):
+         """
+         Run the transcription server.
+
+         Args:
+             host (str): The host address to bind the server.
+             port (int): The port number to bind the server.
+         """
+         if not BackendType.is_valid(backend):
+             raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
+
+         with serve(
+             functools.partial(
+                 self.recv_audio,
+                 backend=BackendType(backend),
+             ),
+             host,
+             port
+         ) as server:
+             server.serve_forever()
+
+     def voice_activity(self, websocket, frame_np):
+         """
+         Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
+
+         This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
+         contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
+         it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
+         speech detection to improve subsequent processing steps.
+
+         Args:
+             websocket: The websocket associated with the current client. Used to retrieve the client object
+                 from the client manager for state management.
+             frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
+                 the audio data for the current frame.
+
+         Returns:
+             bool: True if voice activity is detected in the current frame, False otherwise. When returning False
+                 after detecting no voice activity for more than three consecutive frames, it also triggers the
+                 end-of-speech (EOS) flag for the client.
+         """
+         if not self.vad_detector(frame_np):
+             self.no_voice_activity_chunks += 1
+             if self.no_voice_activity_chunks > 3:
+                 client = self.client_manager.get_client(websocket)
+                 if not client.eos:
+                     client.set_eos(True)
+                 time.sleep(0.1)  # Sleep 100 ms; wait for some voice activity.
+                 return False
+         return True
+
+     def cleanup(self, websocket):
+         """
+         Cleans up resources associated with a given client's websocket.
+
+         Args:
+             websocket: The websocket associated with the client to be cleaned up.
+         """
+         if self.client_manager.get_client(websocket):
+             self.client_manager.remove_client(websocket)
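For reference, the wire protocol this server expects can be exercised with a hand-rolled client. The sketch below is hypothetical (the real client lives in transcribe/client.py) but uses only the handshake fields actually read by handle_new_connection() and initialize_client():

    import json
    import uuid

    import numpy as np
    from websockets.sync.client import connect

    with connect("ws://localhost:9090") as ws:
        # First message: JSON options; "uid" and "language" are required by the server.
        ws.send(json.dumps({"uid": str(uuid.uuid4()), "language": "zh"}))
        print(ws.recv())  # SERVER_READY message sent by PyWhisperCppServe
        # Then: raw float32 PCM at 16 kHz (here, one second of silence).
        ws.send(np.zeros(16000, dtype=np.float32).tobytes())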
transcribe/server/whispercpp.py ADDED
@@ -0,0 +1,326 @@
+
+ from .base import ServeClientBase
+ from pywhispercpp.model import Model
+ import soundfile
+ from concurrent.futures import ProcessPoolExecutor as Pool
+ import numpy as np
+ from logging import getLogger
+ from difflib import SequenceMatcher
+ import collections
+ import config
+ import time
+ import json
+ import threading
+
+ logger = getLogger("Pywhisper")
+
+
+ class TripleTextBuffer:
+     def __init__(self, size=2):
+         self.history = collections.deque(maxlen=size)
+
+     def _clean(self):
+         self.history.clear()
+
+     def add_entry(self, text, index):
+         """
+         text: the transcribed text
+         index: sample offset of this text within the current audio buffer
+         """
+         self.history.append((text, index))
+
+     def get_final_index(self, similarity_threshold=0.7):
+         """Based on how the text changes across passes, return a buffer index at a punctuation mark that can be trusted."""
+         if len(self.history) < 2:
+             return None
+
+         # Texts from the two most recent passes (a third comparison was originally planned)
+         text1, _ = self.history[0]
+         text2, idx2 = self.history[1]
+         # text3, idx3 = self.history[2]
+
+         # Measure how much the text changed between passes
+         sim_12 = self.text_similarity(text1, text2)
+         # print("compare: ", text1, text2, " => ", sim_12)
+         # sim_23 = self.text_similarity(text2, text3)
+         if sim_12 >= similarity_threshold:
+             self._clean()
+             return idx2
+         return None
+
+     @staticmethod
+     def text_similarity(text1, text2):
+         return SequenceMatcher(None, text1, text2).ratio()
+
+
+ class SegmentManager:
+     def __init__(self) -> None:
+         self._commited_segments = []  # committed full segments
+         self._commited_short_sentences = []  # committed short phrases
+         self._temp_string = ""  # current temporary text, held until a sentence-final marker
+
+     def handle(self, string):
+         self._temp_string = string
+         return self
+
+     @property
+     def short_sentence(self) -> str:
+         return "".join(self._commited_short_sentences)
+
+     @property
+     def segment(self):
+         return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
+
+     def get_seg_id(self):
+         return len(self._commited_segments)
+
+     @property
+     def string(self):
+         return self._temp_string
+
+     def commit_short_sentence(self):
+         """Commit the temporary string as a short phrase and clear it."""
+         self._commited_short_sentences.append(self._temp_string)
+         self._temp_string = ""
+
+     def commit_segment(self):
+         """Merge the accumulated short phrases into a full segment."""
+         self._commited_segments.append(self.short_sentence)
+         self._commited_short_sentences = []
+
+     def commit(self, is_end_sentence=False):
+         """
+         When a slice of audio is about to be cut from the buffer, commit the
+         text to the short-phrase queue and clear the temporary string.
+         When a full sentence is finished, also commit it as a segment.
+         """
+         self.commit_short_sentence()
+         if is_end_sentence:
+             self.commit_segment()
+
+
+ class PywhisperInference:
+
+     model = None
+
+     @classmethod
+     def initializer(cls, warmup=True):
+         models_dir = config.MODEL_DIR.as_posix()
+         cls.model = Model(
+             model=config.WHISPER_MODEL,
+             models_dir=models_dir,
+             print_realtime=False,
+             print_progress=False,
+             print_timestamps=False,
+         )
+         if warmup:
+             cls.warmup()
+
+     @classmethod
+     def warmup(cls, warmup_steps=1):
+         mel, _ = soundfile.read("assets/jfk.flac")
+         for _ in range(warmup_steps):
+             cls.model.transcribe(mel, print_progress=False)
+
+     @staticmethod
+     def config_language(language):
+         if language == "zh":
+             return config.MAX_LENGTH_ZH, config.WHISPER_PROMPT_ZH
+         elif language == "en":
+             return config.MAX_LENGTH_EN, config.WHISPER_PROMPT_EN
+         raise ValueError(f"Unsupported language: {language}")
+
+     @classmethod
+     def inference(cls, audio_buffer, language):
+         max_len, prompt = cls.config_language(language)
+         audio_buffer = np.frombuffer(audio_buffer, dtype=np.float32)
+         return cls.model.transcribe(
+             audio_buffer,
+             initial_prompt=prompt,
+             language=language,
+             token_timestamps=True,
+             max_len=max_len
+         )
+
+
+ class PyWhisperCppServe(ServeClientBase):
+
+     def __init__(self, websocket, language=None, client_uid=None):
+         super().__init__(client_uid, websocket)
+         self.language = language
+         # Watch buffer: compares successive transcriptions to decide when the output has stabilized
+         self._text_buffer = TripleTextBuffer()
+         # Stores committed transcription results
+         self._segment_manager = SegmentManager()
+         self.lock = threading.Lock()
+         self.frames_np = None
+         self.sample_rate = 16000
+
+         self._pool = Pool(
+             max_workers=1, initializer=PywhisperInference.initializer)
+
+         logger.info('Create a thread to process audio.')
+         self.trans_thread = threading.Thread(target=self.speech_to_text)
+         self.trans_thread.start()
+
+         self.websocket.send(json.dumps({
+             "uid": self.client_uid,
+             "message": self.SERVER_READY,
+             "backend": "pywhispercpp"
+         }))
+
+     def add_frames(self, frame_np):
+         with self.lock:
+             if self.frames_np is None:
+                 self.frames_np = frame_np.copy()
+             else:
+                 self.frames_np = np.append(self.frames_np, frame_np)
+
+     def update_audio_buffer(self, last_offset):
+         with self.lock:
+             self.frames_np = self.frames_np[last_offset:]
+
+     def transcribe_audio(self, audio_buffer):
+         """
+         Transcribe the audio chunk in the worker process and return the segments.
+
+         Args:
+             audio_buffer (np.array): The audio chunk to transcribe.
+         """
+         fut = self._pool.submit(
+             PywhisperInference.inference, audio_buffer.tobytes(), self.language)
+         return fut.result()
+
+     def _segments_split(self, segments, audio_buffer: np.ndarray):
+         """Split the token sequence at the first punctuation mark from the left into a watched part and the remainder."""
+         left_watch_sequences = []
+         left_watch_idx = 0
+         right_watch_sequences = []
+         is_end = False
+
+         if (len(audio_buffer) / self.sample_rate) < 10:
+             # Under 10 s of audio: split on pause markers such as commas
+             markers = config.PAUSE_END_MARKERS
+             is_end = False
+         else:
+             # Otherwise split on sentence-final markers such as periods and question marks
+             markers = config.SENTENCE_END_MARKERS
+             is_end = True
+
+         for idx, seg in enumerate(segments):
+             left_watch_sequences.append(seg)
+             if seg.text in markers:
+                 seg_index = int(seg.t1 / 100 * self.sample_rate)
+                 rest_buffer_duration = (len(audio_buffer) - seg_index) / self.sample_rate
+                 # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
+                 right_watch_sequences = segments[min(idx + 1, len(segments)):]
+                 if rest_buffer_duration >= 1.5:
+                     left_watch_idx = seg_index
+                     break
+         return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
+
+     def analysis_segments(self, segments, audio_buffer: np.ndarray):
+         # Use the first punctuation mark as an anchor: the left side is the candidate to
+         # confirm, the right side stays under observation. Only after the left side is
+         # confirmed does the right side come under observation, and the confirmed audio
+         # is cut from the buffer to shrink the next input.
+         left_watch_idx, left_watch_sequences, right_watch_sequences, is_end_sentence = self._segments_split(segments, audio_buffer)
+         left_watch_string = "".join(i.text for i in left_watch_sequences)
+         right_watch_string = "".join(i.text for i in right_watch_sequences)
+
+         if left_watch_idx != 0:
+             # Stash the watched string for comparison with the next pass
+             self._text_buffer.add_entry(left_watch_string, left_watch_idx)
+             audio_cut_index = self._text_buffer.get_final_index()
+             if audio_cut_index:
+                 return audio_cut_index, left_watch_string, right_watch_string, is_end_sentence
+         return None, left_watch_string, right_watch_string, is_end_sentence
+
+     def speech_to_text(self):
+         while True:
+             if self.exit:
+                 logger.info("Exiting speech to text thread")
+                 self._pool.shutdown(wait=False, cancel_futures=True)
+                 break
+
+             if self.frames_np is None:
+                 time.sleep(0.02)  # wait for any audio to arrive
+                 continue
+
+             audio_buffer = self.get_audio_chunk_for_processing()
+             try:
+                 logger.info(f"[pywhispercpp:] Processing audio buffer of {len(audio_buffer)} samples")
+                 segments = self.transcribe_audio(audio_buffer)
+             except KeyboardInterrupt:
+                 break
+             except Exception as e:
+                 logger.error(f"[ERROR]: {e}")
+             else:
+                 self.handle_transcription_output(segments, audio_buffer)
+
+     def handle_transcription_output(self, segments, audio_buffer):
+         texts = "".join(i.text for i in segments)
+         self._segment_manager.handle(texts)
+         # Analyze the segments for a stable cut point
+         last_cut_index, left_string, right_string, is_end_sentence = self.analysis_segments(segments, audio_buffer)
+         if last_cut_index:
+             self.update_audio_buffer(last_cut_index)
+             # Commit the confirmed sentence or short phrase
+             self._segment_manager.handle(left_string).commit(is_end_sentence)
+             self._segment_manager.handle(right_string)
+
+         if is_end_sentence and last_cut_index:
+             message = self._segment_manager.segment
+             seg_id = self._segment_manager.get_seg_id() - 1
+             print(seg_id, message)
+             print(seg_id + 1, self._segment_manager.string)
+         else:
+             seg_id = self._segment_manager.get_seg_id()
+             message = self._segment_manager.short_sentence + self._segment_manager.string
+             print(seg_id, message)
+
+     def send_to_client(self, data_dict):
+         content = {
+             "uid": self.client_uid,
+             **data_dict
+         }
+         try:
+             self.websocket.send(
+                 json.dumps(content)
+             )
+         except Exception as e:
+             logger.error(f"[ERROR]: Sending data to client: {e}")
+
+     def get_audio_chunk_for_processing(self):
+         if self.frames_np.shape[0] >= self.sample_rate * 1:
+             return self.frames_np.copy()
+         # Number of samples needed to pad the buffer up to one second
+         padding_length = self.sample_rate * 1 - len(self.frames_np)
+         # Create silence padding (zeros)
+         silence = np.zeros(padding_length + int(0.01 * self.sample_rate), dtype=np.float32)
+         # Prepend the silence to the original audio
+         padded_audio = np.concatenate([silence, self.frames_np])
+         return padded_audio.copy()
+
+     def cleanup(self):
+         logger.info("start shut down worker pool.")
+         self._pool.shutdown(wait=False, cancel_futures=True)
+         logger.info("shut down worker pool success.")
+         return super().cleanup()
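A toy illustration (not part of the commit) of the stabilization idea behind TripleTextBuffer: a punctuation-anchored cut point is only trusted once two consecutive transcription passes of the left-hand text agree.

    buf = TripleTextBuffer()
    buf.add_entry("hello there,", 12800)   # first pass: nothing to compare yet
    assert buf.get_final_index() is None
    buf.add_entry("hello there,", 12800)   # second pass agrees (similarity 1.0 >= 0.7)
    assert buf.get_final_index() == 12800  # safe to cut the audio buffer at sample 12800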
uv.lock ADDED
The diff for this file is too large to render. See raw diff