init whispercpp transcription

Files changed (16) hide show

.gitignore +0 -1
.python-version +1 -0
config.py +19 -0
pyproject.toml +19 -0
requirements.txt +114 -0
run_client.py +1 -1
transcribe/__pycache__/__init__.cpython-311.pyc +0 -0
transcribe/__pycache__/client.cpython-311.pyc +0 -0
transcribe/__pycache__/server.cpython-311.pyc +0 -0
transcribe/__pycache__/utils.cpython-311.pyc +0 -0
transcribe/__pycache__/vad.cpython-311.pyc +0 -0
transcribe/server/__init__.py +2 -0
transcribe/{server.py → server/base.py} +14 -315
transcribe/server/transcription.py +316 -0
transcribe/server/whispercpp.py +326 -0
uv.lock +0 -0

.gitignore CHANGED Viewed

@@ -1,7 +1,6 @@
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so

 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

config.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import pathlib
+# Filesystem locations, resolved relative to the directory containing this file.
+BASE_DIR = pathlib.Path(__file__).parent
+MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
+# Punctuation marks.
+SENTENCE_END_MARKERS = ['.', '!', '?', '。', '！', '？', ';', '；', ':', '：']
+PAUSE_END_MARKERS = [',', '，', '、']
+# Whisper inference parameters.
+WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
+MAX_LENGTH_ZH = 3
+# Alias for the original misspelled name so existing references keep working.
+MAX_LENTH_ZH = MAX_LENGTH_ZH
+WHISPER_PROMPT_EN = "The following is an English sentence."
+MAX_LENGTH_EN = 3
+WHISPER_MODEL = 'medium-q5_0'

pyproject.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[project]
+name = "trans"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "av>=14.2.0",
+    "librosa>=0.11.0",
+    "numpy>=2.1.3",
+    "onnxruntime>=1.21.0",
+    "pyaudio>=0.2.14",
+    "setuptools>=78.1.0",
+    "soundfile>=0.13.1",
+    "torch>=2.6.0",
+    "tqdm>=4.67.1",
+    "websocket-client>=1.8.0",
+    "websockets>=15.0.1",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,114 @@

+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml -o requirements.txt
+audioread==3.0.1
+    # via librosa
+av==14.3.0
+    # via trans (pyproject.toml)
+certifi==2025.1.31
+    # via requests
+cffi==1.17.1
+    # via soundfile
+charset-normalizer==3.4.1
+    # via requests
+coloredlogs==15.0.1
+    # via onnxruntime
+decorator==5.2.1
+    # via librosa
+filelock==3.18.0
+    # via torch
+flatbuffers==25.2.10
+    # via onnxruntime
+fsspec==2025.3.2
+    # via torch
+humanfriendly==10.0
+    # via coloredlogs
+idna==3.10
+    # via requests
+jinja2==3.1.6
+    # via torch
+joblib==1.4.2
+    # via
+    #   librosa
+    #   scikit-learn
+lazy-loader==0.4
+    # via librosa
+librosa==0.11.0
+    # via trans (pyproject.toml)
+llvmlite==0.44.0
+    # via numba
+markupsafe==3.0.2
+    # via jinja2
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.0
+    # via librosa
+networkx==3.4.2
+    # via torch
+numba==0.61.0
+    # via librosa
+numpy==2.1.3
+    # via
+    #   trans (pyproject.toml)
+    #   librosa
+    #   numba
+    #   onnxruntime
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+onnxruntime==1.21.0
+    # via trans (pyproject.toml)
+packaging==24.2
+    # via
+    #   lazy-loader
+    #   onnxruntime
+    #   pooch
+platformdirs==4.3.7
+    # via pooch
+pooch==1.8.2
+    # via librosa
+protobuf==6.30.2
+    # via onnxruntime
+pyaudio==0.2.14
+    # via trans (pyproject.toml)
+pycparser==2.22
+    # via cffi
+requests==2.32.3
+    # via pooch
+scikit-learn==1.6.1
+    # via librosa
+scipy==1.15.2
+    # via
+    #   librosa
+    #   scikit-learn
+setuptools==78.1.0
+    # via trans (pyproject.toml)
+soundfile==0.13.1
+    # via
+    #   trans (pyproject.toml)
+    #   librosa
+soxr==0.5.0.post1
+    # via librosa
+sympy==1.13.1
+    # via
+    #   onnxruntime
+    #   torch
+threadpoolctl==3.6.0
+    # via scikit-learn
+torch==2.6.0
+    # via trans (pyproject.toml)
+tqdm==4.67.1
+    # via trans (pyproject.toml)
+typing-extensions==4.13.1
+    # via
+    #   librosa
+    #   torch
+urllib3==2.3.0
+    # via requests
+websocket-client==1.8.0
+    # via trans (pyproject.toml)
+websockets==15.0.1
+    # via trans (pyproject.toml)
+llama-cpp-python
+pywhispercpp

run_client.py CHANGED Viewed

@@ -2,7 +2,7 @@ from transcribe.client import TranscriptionClient
 client = TranscriptionClient(
     "localhost",
-    9000,
     lang="zh",
     save_output_recording=False,  # Only used for microphone input, False by Default
     output_recording_filename="./output_recording.wav",  # Only used for microphone input

 client = TranscriptionClient(
     "localhost",
+    9090,
     lang="zh",
     save_output_recording=False,  # Only used for microphone input, False by Default
     output_recording_filename="./output_recording.wav",  # Only used for microphone input

transcribe/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (183 Bytes)

transcribe/__pycache__/client.cpython-311.pyc DELETED Viewed

Binary file (39 kB)

transcribe/__pycache__/server.cpython-311.pyc DELETED Viewed

Binary file (36 kB)

transcribe/__pycache__/utils.cpython-311.pyc DELETED Viewed

Binary file (4.64 kB)

transcribe/__pycache__/vad.cpython-311.pyc DELETED Viewed

Binary file (9.36 kB)

transcribe/server/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+ from .transcription import TranscriptionServer

transcribe/{server.py → server/base.py} RENAMED Viewed

@@ -1,324 +1,17 @@
-import functools
 import json
 import logging
 import pathlib
 import threading
 import time
-from enum import Enum
-from typing import List, Optional
 import librosa
 import numpy as np
 import soundfile
 from pywhispercpp.model import Model
-from websockets.exceptions import ConnectionClosed
-from websockets.sync.server import serve
-from transcribe.vad import VoiceActivityDetector
 logging.basicConfig(level=logging.INFO)
-class ClientManager:
-    def __init__(self, max_clients=4, max_connection_time=600):
-        """
-        Initializes the ClientManager with specified limits on client connections and connection durations.
-        Args:
-            max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
-            max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
-                                                 to 600 seconds (10 minutes).
-        """
-        self.clients = {}
-        self.start_times = {}
-        self.max_clients = max_clients
-        self.max_connection_time = max_connection_time
-    def add_client(self, websocket, client):
-        """
-        Adds a client and their connection start time to the tracking dictionaries.
-        Args:
-            websocket: The websocket associated with the client to add.
-            client: The client object to be added and tracked.
-        """
-        self.clients[websocket] = client
-        self.start_times[websocket] = time.time()
-    def get_client(self, websocket):
-        """
-        Retrieves a client associated with the given websocket.
-        Args:
-            websocket: The websocket associated with the client to retrieve.
-        Returns:
-            The client object if found, False otherwise.
-        """
-        if websocket in self.clients:
-            return self.clients[websocket]
-        return False
-    def remove_client(self, websocket):
-        """
-        Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
-        client if necessary.
-        Args:
-            websocket: The websocket associated with the client to be removed.
-        """
-        client = self.clients.pop(websocket, None)
-        if client:
-            client.cleanup()
-        self.start_times.pop(websocket, None)
-    def get_wait_time(self):
-        """
-        Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
-        Returns:
-            The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
-        """
-        wait_time = None
-        for start_time in self.start_times.values():
-            current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
-            if wait_time is None or current_client_time_remaining < wait_time:
-                wait_time = current_client_time_remaining
-        return wait_time / 60 if wait_time is not None else 0
-    def is_server_full(self, websocket, options):
-        """
-        Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
-        Args:
-            websocket: The websocket of the client attempting to connect.
-            options: A dictionary of options that may include the client's unique identifier.
-        Returns:
-            True if the server is full, False otherwise.
-        """
-        if len(self.clients) >= self.max_clients:
-            wait_time = self.get_wait_time()
-            response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
-            websocket.send(json.dumps(response))
-            return True
-        return False
-    def is_client_timeout(self, websocket):
-        """
-        Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
-        Args:
-            websocket: The websocket associated with the client to check.
-        Returns:
-            True if the client's connection time has exceeded the maximum limit, False otherwise.
-        """
-        elapsed_time = time.time() - self.start_times[websocket]
-        if elapsed_time >= self.max_connection_time:
-            self.clients[websocket].disconnect()
-            logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
-            return True
-        return False
-class BackendType(Enum):
-    PYWHISPERCPP = "pywhispercpp"
-    @staticmethod
-    def valid_types() -> List[str]:
-        return [backend_type.value for backend_type in BackendType]
-    @staticmethod
-    def is_valid(backend: str) -> bool:
-        return backend in BackendType.valid_types()
-    def is_pywhispercpp(self) -> bool:
-        return self == BackendType.PYWHISPERCPP
-class TranscriptionServer:
-    RATE = 16000
-    def __init__(self):
-        self.client_manager = None
-        self.no_voice_activity_chunks = 0
-        self.single_model = False
-    def initialize_client(
-            self, websocket, options
-    ):
-        client: Optional[ServeClientBase] = None
-        if self.backend.is_pywhispercpp():
-            client = ServeClientWhisperCPP(
-                websocket,
-                language=options["language"],
-                client_uid=options["uid"],
-                single_model=self.single_model,
-            )
-            logging.info("Running pywhispercpp backend.")
-        if client is None:
-            raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
-        self.client_manager.add_client(websocket, client)
-    def get_audio_from_websocket(self, websocket):
-        """
-        Receives audio buffer from websocket and creates a numpy array out of it.
-        Args:
-            websocket: The websocket to receive audio from.
-        Returns:
-            A numpy array containing the audio.
-        """
-        frame_data = websocket.recv()
-        if frame_data == b"END_OF_AUDIO":
-            return False
-        return np.frombuffer(frame_data, dtype=np.float32)
-    def handle_new_connection(self, websocket):
-        try:
-            logging.info("New client connected")
-            options = websocket.recv()
-            options = json.loads(options)
-            if self.client_manager is None:
-                max_clients = options.get('max_clients', 4)
-                max_connection_time = options.get('max_connection_time', 600)
-                self.client_manager = ClientManager(max_clients, max_connection_time)
-            if self.client_manager.is_server_full(websocket, options):
-                websocket.close()
-                return False  # Indicates that the connection should not continue
-            if self.backend.is_pywhispercpp():
-                self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
-            self.initialize_client(websocket, options)
-            return True
-        except json.JSONDecodeError:
-            logging.error("Failed to decode JSON from client")
-            return False
-        except ConnectionClosed:
-            logging.info("Connection closed by client")
-            return False
-        except Exception as e:
-            logging.error(f"Error during new connection initialization: {str(e)}")
-            return False
-    def process_audio_frames(self, websocket):
-        frame_np = self.get_audio_from_websocket(websocket)
-        client = self.client_manager.get_client(websocket)
-        # TODO Vad has some problem, it will be blocking process loop
-        # if frame_np is False:
-        #     if self.backend.is_pywhispercpp():
-        #         client.set_eos(True)
-        #     return False
-        # if self.backend.is_pywhispercpp():
-        #     voice_active = self.voice_activity(websocket, frame_np)
-        #     if voice_active:
-        #         self.no_voice_activity_chunks = 0
-        #         client.set_eos(False)
-        #     if self.use_vad and not voice_active:
-        #         return True
-        client.add_frames(frame_np)
-        return True
-    def recv_audio(self,
-                   websocket,
-                   backend: BackendType = BackendType.PYWHISPERCPP):
-        self.backend = backend
-        if not self.handle_new_connection(websocket):
-            return
-        try:
-            while not self.client_manager.is_client_timeout(websocket):
-                if not self.process_audio_frames(websocket):
-                    break
-        except ConnectionClosed:
-            logging.info("Connection closed by client")
-        except Exception as e:
-            logging.error(f"Unexpected error: {str(e)}")
-        finally:
-            if self.client_manager.get_client(websocket):
-                self.cleanup(websocket)
-                websocket.close()
-            del websocket
-    def run(self,
-            host,
-            port=9090,
-            backend="pywhispercpp"):
-        """
-        Run the transcription server.
-        Args:
-            host (str): The host address to bind the server.
-            port (int): The port number to bind the server.
-        """
-        if not BackendType.is_valid(backend):
-            raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
-        with serve(
-                functools.partial(
-                    self.recv_audio,
-                    backend=BackendType(backend),
-                ),
-                host,
-                port
-        ) as server:
-            server.serve_forever()
-    def voice_activity(self, websocket, frame_np):
-        """
-        Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
-        This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
-        contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
-        it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
-        speech detection to improve subsequent processing steps.
-        Args:
-            websocket: The websocket associated with the current client. Used to retrieve the client object
-                    from the client manager for state management.
-            frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
-                                    the audio data for the current frame.
-        Returns:
-            bool: True if voice activity is detected in the current frame, False otherwise. When returning False
-                after detecting no voice activity for more than three consecutive frames, it also triggers the
-                end-of-speech (EOS) flag for the client.
-        """
-        if not self.vad_detector(frame_np):
-            self.no_voice_activity_chunks += 1
-            if self.no_voice_activity_chunks > 3:
-                client = self.client_manager.get_client(websocket)
-                if not client.eos:
-                    client.set_eos(True)
-                time.sleep(0.1)  # Sleep 100m; wait some voice activity.
-            return False
-        return True
-    def cleanup(self, websocket):
-        """
-        Cleans up resources associated with a given client's websocket.
-        Args:
-            websocket: The websocket associated with the client to be cleaned up.
-        """
-        if self.client_manager.get_client(websocket):
-            self.client_manager.remove_client(websocket)
 class ServeClientBase(object):
     RATE = 16000
     SERVER_READY = "SERVER_READY"
@@ -442,6 +135,7 @@ class ServeClientBase(object):
             segments = self.transcript.copy()
         if last_segment is not None:
             segments = segments + [last_segment]
         return segments
     def get_audio_chunk_duration(self, input_bytes):
@@ -549,10 +243,8 @@ class ServeClientWhisperCPP(ServeClientBase):
         """
         Instantiates a new model, sets it as the transcriber and does warmup if desired.
         """
-        model = 'medium-q5_0'
-        here = pathlib.Path(__file__)
-        models_dir = f'{here.parent.parent / "moyoyo_asr_models"}'
-        self.transcriber = Model(model=model, models_dir=models_dir)
         if warmup:
             self.warmup()
@@ -610,8 +302,15 @@ class ServeClientWhisperCPP(ServeClientBase):
             prompt = '以下是简体中文普通话的句子。'
         else:
             prompt = 'The following is an English sentence.'
-        segments = self.transcriber.transcribe(mel, language='zh', initial_prompt=prompt, print_progress=False)
         text = []
         for segment in segments:
             content = segment.text

 import json
 import logging
 import pathlib
 import threading
 import time
+import config
 import librosa
 import numpy as np
 import soundfile
 from pywhispercpp.model import Model
 logging.basicConfig(level=logging.INFO)
 class ServeClientBase(object):
     RATE = 16000
     SERVER_READY = "SERVER_READY"
             segments = self.transcript.copy()
         if last_segment is not None:
             segments = segments + [last_segment]
+        logging.info(f"{segments}")
         return segments
     def get_audio_chunk_duration(self, input_bytes):
         """
         Instantiates a new model, sets it as the transcriber and does warmup if desired.
         """
+        self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
         if warmup:
             self.warmup()
             prompt = '以下是简体中文普通话的句子。'
         else:
             prompt = 'The following is an English sentence.'
+        segments = self.transcriber.transcribe(
+            mel,
+            language=self.language,
+            initial_prompt=prompt,
+            token_timestamps=True,
+            # max_len=max_len,
+            print_progress=False
+        )
         text = []
         for segment in segments:
             content = segment.text

transcribe/server/transcription.py ADDED Viewed

	@@ -0,0 +1,316 @@

+import logging
+import time
+import functools
+import json
+import logging
+import time
+from enum import Enum
+from typing import List, Optional
+import numpy as np
+from .base import ServeClientBase, ServeClientWhisperCPP
+from .whispercpp import PyWhiperCppServe
+from ..vad import VoiceActivityDetector
+from websockets.exceptions import ConnectionClosed
+from websockets.sync.server import serve
+logging.basicConfig(level=logging.INFO)
+class ClientManager:
+    """Tracks connected clients by websocket and enforces the capacity and session-length limits."""
+    def __init__(self, max_clients=4, max_connection_time=600):
+        """
+        Initializes the ClientManager with specified limits on client connections and connection durations.
+        Args:
+            max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
+            max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
+                                                 to 600 seconds (10 minutes).
+        """
+        # Both tables are keyed by the websocket object of the connection.
+        self.clients = {}
+        self.start_times = {}
+        self.max_clients = max_clients
+        self.max_connection_time = max_connection_time
+    def add_client(self, websocket, client):
+        """
+        Adds a client and their connection start time to the tracking dictionaries.
+        Args:
+            websocket: The websocket associated with the client to add.
+            client: The client object to be added and tracked.
+        """
+        self.clients[websocket] = client
+        self.start_times[websocket] = time.time()
+    def get_client(self, websocket):
+        """
+        Retrieves a client associated with the given websocket.
+        Args:
+            websocket: The websocket associated with the client to retrieve.
+        Returns:
+            The client object if found, False otherwise.
+        """
+        return self.clients.get(websocket, False)
+    def remove_client(self, websocket):
+        """
+        Removes a client and their connection start time from the tracking dictionaries and calls the client's
+        ``cleanup()`` if one was registered. Unknown websockets are ignored.
+        Args:
+            websocket: The websocket associated with the client to be removed.
+        """
+        client = self.clients.pop(websocket, None)
+        if client:
+            client.cleanup()
+        self.start_times.pop(websocket, None)
+    def get_wait_time(self):
+        """
+        Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
+        Returns:
+            The shortest remaining session time among tracked clients, in minutes, never negative. Returns 0 when
+            no client is tracked.
+        """
+        if not self.start_times:
+            return 0
+        now = time.time()
+        shortest_remaining = min(
+            self.max_connection_time - (now - started) for started in self.start_times.values()
+        )
+        # A session that has overrun its limit but is still tracked must not yield a negative wait.
+        return max(shortest_remaining, 0) / 60
+    def is_server_full(self, websocket, options):
+        """
+        Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
+        Args:
+            websocket: The websocket of the client attempting to connect.
+            options: A dictionary of options that may include the client's unique identifier under "uid".
+        Returns:
+            True if the server is full, False otherwise.
+        """
+        if len(self.clients) < self.max_clients:
+            return False
+        # "uid" is optional: a client that omitted it still gets the WAIT reply (with a null uid).
+        response = {"uid": options.get("uid"), "status": "WAIT", "message": self.get_wait_time()}
+        websocket.send(json.dumps(response))
+        return True
+    def is_client_timeout(self, websocket):
+        """
+        Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
+        Args:
+            websocket: The websocket associated with the client to check.
+        Returns:
+            True if the client's connection time has exceeded the maximum limit, False otherwise.
+        Raises:
+            KeyError: If the websocket is not tracked by this manager.
+        """
+        elapsed_time = time.time() - self.start_times[websocket]
+        if elapsed_time < self.max_connection_time:
+            return False
+        client = self.clients[websocket]
+        client.disconnect()
+        logging.warning(f"Client with uid '{client.client_uid}' disconnected due to overtime.")
+        return True
+class BackendType(Enum):
+    """Transcription backends the server can be started with."""
+    PYWHISPERCPP = "pywhispercpp"
+    @staticmethod
+    def valid_types() -> List[str]:
+        """Return the string value of every declared backend, in declaration order."""
+        names = []
+        for member in BackendType:
+            names.append(member.value)
+        return names
+    @staticmethod
+    def is_valid(backend: str) -> bool:
+        """Report whether ``backend`` is the string value of a declared backend."""
+        known = BackendType.valid_types()
+        return backend in known
+    def is_pywhispercpp(self) -> bool:
+        """Report whether this member is the pywhispercpp backend."""
+        return self is BackendType.PYWHISPERCPP
+class TranscriptionServer:
+    """Websocket server that receives client audio and forwards the frames to a per-connection transcription client."""
+    RATE = 16000
+    def __init__(self):
+        """Start without a client manager; one is created when the first connection is handled."""
+        self.client_manager = None
+        self.no_voice_activity_chunks = 0
+        self.single_model = False
+    def initialize_client(
+            self, websocket, options
+    ):
+        """
+        Creates the backend-specific client for a connection and registers it with the client manager.
+        Args:
+            websocket: The websocket of the connecting client.
+            options (dict): Connection options; the "language" and "uid" keys are read.
+        Raises:
+            ValueError: If no client could be created for the active backend.
+        """
+        client: Optional[ServeClientBase] = None
+        if self.backend.is_pywhispercpp():
+            client = PyWhiperCppServe(
+                websocket,
+                language=options["language"],
+                client_uid=options["uid"],
+            )
+            logging.info("Running pywhispercpp backend.")
+        if client is None:
+            raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
+        self.client_manager.add_client(websocket, client)
+    def get_audio_from_websocket(self, websocket):
+        """
+        Receives audio buffer from websocket and creates a numpy array out of it.
+        Args:
+            websocket: The websocket to receive audio from.
+        Returns:
+            A float32 numpy array containing the audio, or False when the client sent b"END_OF_AUDIO".
+        """
+        frame_data = websocket.recv()
+        if frame_data == b"END_OF_AUDIO":
+            return False
+        return np.frombuffer(frame_data, dtype=np.float32)
+    def handle_new_connection(self, websocket):
+        """
+        Performs the handshake for a new connection.
+        The first message is parsed as the JSON options. The client manager is created on the first connection,
+        taking "max_clients" and "max_connection_time" from that connection's options. The connection is closed
+        and rejected when the server is full; otherwise a client is created and registered.
+        Args:
+            websocket: The websocket of the connecting client.
+        Returns:
+            True if the connection was accepted, False if it was rejected or any error occurred.
+        """
+        try:
+            logging.info("New client connected")
+            options = websocket.recv()
+            options = json.loads(options)
+            if self.client_manager is None:
+                max_clients = options.get('max_clients', 4)
+                max_connection_time = options.get('max_connection_time', 600)
+                self.client_manager = ClientManager(max_clients, max_connection_time)
+            if self.client_manager.is_server_full(websocket, options):
+                websocket.close()
+                return False  # Indicates that the connection should not continue
+            if self.backend.is_pywhispercpp():
+                self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
+            self.initialize_client(websocket, options)
+            return True
+        except json.JSONDecodeError:
+            logging.error("Failed to decode JSON from client")
+            return False
+        except ConnectionClosed:
+            logging.info("Connection closed by client")
+            return False
+        except Exception as e:
+            logging.error(f"Error during new connection initialization: {str(e)}")
+            return False
+    def process_audio_frames(self, websocket):
+        """
+        Receives one message from the websocket and hands the audio to the connection's client.
+        Args:
+            websocket: The websocket to read from.
+        Returns:
+            True to keep receiving, False once the client signalled END_OF_AUDIO.
+        """
+        frame_np = self.get_audio_from_websocket(websocket)
+        # END_OF_AUDIO arrives as the sentinel False; it must end the receive loop rather than be
+        # handed to the client as if it were an audio frame.
+        if frame_np is False:
+            return False
+        client = self.client_manager.get_client(websocket)
+        # TODO: VAD gating (see voice_activity) is disabled because it blocks the processing loop.
+        client.add_frames(frame_np)
+        return True
+    def recv_audio(self,
+                   websocket,
+                   backend: BackendType = BackendType.PYWHISPERCPP):
+        """
+        Connection handler: performs the handshake, then receives audio until the session times out, the client
+        signals the end of audio, or the connection fails. The client is always cleaned up afterwards.
+        Args:
+            websocket: The websocket of the connection being served.
+            backend (BackendType): The backend to use for this connection.
+        """
+        self.backend = backend
+        if not self.handle_new_connection(websocket):
+            return
+        try:
+            while not self.client_manager.is_client_timeout(websocket):
+                if not self.process_audio_frames(websocket):
+                    break
+        except ConnectionClosed:
+            logging.info("Connection closed by client")
+        except Exception as e:
+            logging.error(f"Unexpected error: {str(e)}")
+        finally:
+            if self.client_manager.get_client(websocket):
+                self.cleanup(websocket)
+                websocket.close()
+    def run(self,
+            host,
+            port=9090,
+            backend="pywhispercpp"):
+        """
+        Run the transcription server. Blocks while serving.
+        Args:
+            host (str): The host address to bind the server.
+            port (int): The port number to bind the server.
+            backend (str): The backend name; must be one of BackendType.valid_types().
+        Raises:
+            ValueError: If ``backend`` is not a valid backend type.
+        """
+        if not BackendType.is_valid(backend):
+            raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
+        with serve(
+                functools.partial(
+                    self.recv_audio,
+                    backend=BackendType(backend),
+                ),
+                host,
+                port
+        ) as server:
+            server.serve_forever()
+    def voice_activity(self, websocket, frame_np):
+        """
+        Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
+        Each frame without voice increments a counter. Once the counter exceeds three, the end-of-speech (EOS)
+        flag is set on the associated client (if not already set) and the call sleeps for 100 ms.
+        Args:
+            websocket: The websocket associated with the current client. Used to retrieve the client object
+                    from the client manager for state management.
+            frame_np (numpy.ndarray): The audio frame to be analyzed.
+        Returns:
+            bool: True if voice activity is detected in the current frame, False otherwise.
+        """
+        if not self.vad_detector(frame_np):
+            self.no_voice_activity_chunks += 1
+            if self.no_voice_activity_chunks > 3:
+                client = self.client_manager.get_client(websocket)
+                if not client.eos:
+                    client.set_eos(True)
+                time.sleep(0.1)  # Sleep 100 ms; wait for some voice activity.
+            return False
+        return True
+    def cleanup(self, websocket):
+        """
+        Cleans up resources associated with a given client's websocket.
+        Args:
+            websocket: The websocket associated with the client to be cleaned up.
+        """
+        if self.client_manager.get_client(websocket):
+            self.client_manager.remove_client(websocket)

transcribe/server/whispercpp.py ADDED Viewed

	@@ -0,0 +1,326 @@

+from .base import ServeClientBase
+from pywhispercpp.model import Model
+import soundfile
+from concurrent.futures import ProcessPoolExecutor as Pool
+import numpy as np
+from logging import getLogger
+from difflib import SequenceMatcher
+import collections
+import config
+import time
+import json
+import threading
+logger = getLogger("Pywhisper")
+class TripleTextBuffer:
+    """Bounded history of (text, index) entries, used to decide when successive texts have stopped changing."""
+    def __init__(self, size=2):
+        """Create an empty history that keeps at most ``size`` entries."""
+        self.history = collections.deque(maxlen=size)
+    def _clean(self):
+        """Discard every stored entry."""
+        self.history.clear()
+    def add_entry(self, text, index):
+        """
+        Append one entry to the history.
+        Args:
+            text: The text.
+            index: Relative position (array index) within the current buffer.
+        """
+        entry = (text, index)
+        self.history.append(entry)
+    def get_final_index(self, similarity_threshold=0.7):
+        """
+        Return the buffer index of a punctuation position judged reliable from how the text changed.
+        The first two stored texts are compared. When their similarity reaches ``similarity_threshold`` the
+        history is cleared and the index stored with the second entry is returned.
+        Returns:
+            The index of the second entry, or None when fewer than two entries are stored or the texts differ
+            too much.
+        """
+        if len(self.history) < 2:
+            return None
+        (older_text, _), (newer_text, newer_index) = self.history[0], self.history[1]
+        is_stable = self.text_similarity(older_text, newer_text) >= similarity_threshold
+        if not is_stable:
+            return None
+        self._clean()
+        return newer_index
+    @staticmethod
+    def text_similarity(text1, text2):
+        """Return the difflib SequenceMatcher ratio of the two texts."""
+        matcher = SequenceMatcher(None, text1, text2)
+        return matcher.ratio()
+class SegmentManager:
+    def __init__(self) -> None:
+        self._commited_segments = [] # 确定后的段落
+        self._commited_short_sentences = [] # 确定后的序列
+        self._temp_string = "" # 存储当前临时的文本字符串，直到以句号结尾
+    def handle(self, string):
+        self._temp_string = string
+        return self
+    @property
+    def short_sentence(self) -> str:
+        return "".join(self._commited_short_sentences)
+    @property
+    def segment(self):
+        return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
+    def get_seg_id(self):
+        return len(self._commited_segments)
+    @property
+    def string(self):
+        return self._temp_string
+    def commit_short_sentence(self):
+        """将临时字符串 提交到临时短句"""
+        self._commited_short_sentences.append(self._temp_string)
+        self._temp_string = ""
+    def commit_segment(self):
+        """将短句 合并 到长句中"""
+        self._commited_segments.append(self.short_sentence)
+        self._commited_short_sentences = []
+    def commit(self, is_end_sentence=False):
+        """
+        当需要切掉的音频部分的时候，将句子提交到短句队列中，并移除临时字符串
+        当完成一个整句的时候提交到段落中
+        """
+        self.commit_short_sentence()
+        if is_end_sentence:
+            self.commit_segment()
+class PywhisperInference:
+    model = None
+    @classmethod
+    def initializer(cls, warmup=True):
+        models_dir = config.MODEL_DIR.as_posix()
+        cls.model = Model(
+            model=config.WHISPER_MODEL,
+            models_dir=models_dir,
+            print_realtime=False,
+            print_progress=False,
+            print_timestamps=False,
+        )
+        if warmup:
+            cls.warmup()
+    @classmethod
+    def warmup(cls, warmup_steps=1):
+        mel, _, = soundfile.read("assets/jfk.flac")
+        for _ in range(warmup_steps):
+            cls.model.transcribe(mel, print_progress=False)
+    @staticmethod
+    def config_language(language):
+        if language == "zh":
+            return config.MAX_LENTH_ZH, config.WHISPER_PROMPT_ZH
+        elif language == "en":
+            return config.MAX_LENTH_ZH, config.WHISPER_PROMPT_ZH
+        raise ValueError(f"Unsupported language : {language}")
+    @classmethod
+    def inference(cls, audio_buffer, language):
+        max_len, prompt = cls.config_language(language)
+        audio_buffer = np.frombuffer(audio_buffer, dtype=np.float32)
+        return cls.model.transcribe(
+            audio_buffer,
+            initial_prompt=prompt,
+            language=language,
+            token_timestamps=True,
+            max_len=max_len
+        )
+class PyWhiperCppServe(ServeClientBase):
+    def __init__(self, websocket, language=None, client_uid=None,):
+        super().__init__(client_uid, websocket)
+        self.language = language
+        # 设置观察字符串 对比上下次的文字来判断字符串的输出是否固定
+        self._text_buffer = TripleTextBuffer()
+        # 存储转录数据
+        self._segment_manager = SegmentManager()
+        self.lock = threading.Lock()
+        self.frames_np = None
+        self.sample_rate = 16000
+        self._pool = Pool(
+            max_workers=1, initializer=PywhisperInference.initializer)
+        logger.info('Create a thread to process audio.')
+        self.trans_thread = threading.Thread(target=self.speech_to_text)
+        self.trans_thread.start()
+        self.websocket.send(json.dumps({
+            "uid": self.client_uid,
+            "message": self.SERVER_READY,
+            "backend": "pywhispercpp"
+        }))
+    def add_frames(self, frame_np):
+        with self.lock:
+            if self.frames_np is None:
+                self.frames_np = frame_np.copy()
+            else:
+                self.frames_np = np.append(self.frames_np,frame_np)
+    def update_audio_buffer(self, last_offset):
+        with self.lock:
+            self.frames_np = self.frames_np[last_offset:]
+    def transcribe_audio(self, audio_buffer):
+        """
+         Transcribe the audio chunk and send the results to the client.
+        Args:
+            audio_buffer (np.array): The audio chunk to transcribe.
+        """
+        fut = self._pool.submit(
+            PywhisperInference.inference, audio_buffer.tobytes(), self.language)
+        return fut.result()
+    def _segments_split(self, segments, audio_buffer: np.ndarray):
+        """根据左边第一个标点符号来将序列拆分成 观察段 和 剩余部分"""
+        left_watch_sequences = []
+        left_watch_idx = 0
+        right_watch_sequences = []
+        is_end = False
+        if (len(audio_buffer) / self.sample_rate) < 10:
+            # 低于10s 使用短句符号比如逗号作为判断依据
+            markers = config.PAUSE_END_MARKERS
+            is_end = False
+        else:
+            # 使用句号 问好等长句结尾符号作为判断
+            markers = config.SENTENCE_END_MARKERS
+            is_end = True
+        for idx, seg in enumerate(segments):
+            left_watch_sequences.append(seg)
+            if seg.text in markers:
+                seg_index = int(seg.t1 / 100 * self.sample_rate)
+                rest_buffer_duration = (len(audio_buffer) - seg_index) / self.sample_rate
+                # is_end = any(i in seg.text for i  in config.SENTENCE_END_MARKERS)
+                right_watch_sequences = segments[min(idx+1, len(segments)):]
+                if rest_buffer_duration >= 1.5:
+                    left_watch_idx = seg_index
+                break
+        return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
+    def analysis_segments(self, segments,  audio_buffer: np.ndarray):
+          # 找到第一个标点符号作为锚点 左边为确认段，右边为观察段，
+        #   当左边确认后，右边段才会进入观察
+        #   当左边确认后，会从缓冲区中删除对应的buffer，减少下次输入的数据量
+        left_watch_idx, left_watch_sequences, right_watch_sequences, is_end_sentence = self._segments_split(segments, audio_buffer)
+        left_watch_string = "".join(i.text for i in left_watch_sequences)
+        right_watch_string = "".join(i.text for i in right_watch_sequences)
+        if left_watch_idx != 0:
+            # 将观察字符串临时存储
+            self._text_buffer.add_entry(left_watch_string, left_watch_idx)
+            audio_cut_index = self._text_buffer.get_final_index()
+            if audio_cut_index:
+                return audio_cut_index, left_watch_string, right_watch_string, is_end_sentence
+        return None, left_watch_string, right_watch_string, is_end_sentence
+    def speech_to_text(self):
+        while True:
+            if self.exit:
+                logger.info("Exiting speech to text thread")
+                self._pool.shutdown(wait=False, cancel_futures=True)
+                break
+            if self.frames_np is None:
+                time.sleep(0.02)  # wait for any audio to arrive
+                continue
+            audio_buffer = self.get_audio_chunk_for_processing()
+            # logger.info(f"[pywhispercpp:] Processing audio with duration: {len(audio_buffer)}")
+            # segments = self.transcribe_audio(audio_buffer)
+            try:
+                logger.info(f"[pywhispercpp:] Processing audio with duration: {len(audio_buffer)}")
+                segments = self.transcribe_audio(audio_buffer)
+            except KeyboardInterrupt:
+                break
+            except Exception as e:
+                logger.error(f"[ERROR]: {e}")
+            else:
+                self.handle_transcription_output(segments, audio_buffer)
+    def handle_transcription_output(self, segments, audio_buffer):
+        texts  = "".join(i.text for i in segments)
+        self._segment_manager.handle(texts)
+         # 分析句子
+        last_cut_index, left_string, right_string, is_end_sentence = self.analysis_segments(segments, audio_buffer)
+        # print(last_cut_index, left_string, right_string, is_end_sentence)
+        if last_cut_index:
+            self.update_audio_buffer(last_cut_index)
+            # 句子或者短句的提交
+            self._segment_manager.handle(left_string).commit(is_end_sentence)
+            self._segment_manager.handle(right_string)
+        if is_end_sentence and last_cut_index:
+            message = self._segment_manager.segment
+            seg_id = self._segment_manager.get_seg_id() - 1
+            # elapsed_time = time.time() - start_time
+            # formatted_time = f"{int(elapsed_time // 60):02}:{int(elapsed_time % 60):02}:{(elapsed_time % 1) * 1000:03.0f}"
+            print(seg_id, message,)
+            print(seg_id + 1, self._segment_manager.string,)
+        else:
+            seg_id = self._segment_manager.get_seg_id()
+            message = self._segment_manager.short_sentence + self._segment_manager.string
+            # print(self._segment_manager.__dict__)
+            # elapsed_time = time.time() - start_time
+            # formatted_time = f"{int(elapsed_time // 60):02}:{int(elapsed_time % 60):02}:{(elapsed_time % 1) * 1000:03.0f}"
+            print(seg_id, message)
+    def send_to_client(self, data_dict):
+        content = {
+            "uid": self.client_uid,
+            **data_dict
+        }
+        try:
+            self.websocket.send(
+                json.dumps(content)
+            )
+        except Exception as e:
+            logger.error(f"[ERROR]: Sending data to client: {e}")
+    def get_audio_chunk_for_processing(self):
+        if self.frames_np.shape[0] >= self.sample_rate * 1:
+            return self.frames_np.copy()
+        # 计算需要填充的样本数
+        padding_length = self.sample_rate * 1 - len(self.frames_np)
+        # 创建静音填充（零值）
+        silence = np.zeros(padding_length + int(0.01 * self.sample_rate), dtype=np.float32)
+        # 拼接原始音频和静音填充
+        padded_audio = np.concatenate([silence, self.frames_np])
+        return padded_audio.copy()
+    def cleanup(self):
+        logger.info("start shut down worker pool.")
+        self._pool.shutdown(wait=False, cancel_futures=True)
+        logger.info("shut down worker pool success.")
+        return super().cleanup()#

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff