daihui.zhang
committed on
Commit · 93a0cf7
1 Parent(s): 4499bab
fix max speech duration bug
Browse files
- config.py +2 -0
- tests/audio_utils.py +54 -0
- tests/test_vad.ipynb +129 -0
- transcribe/helpers/vadprocessor.py +7 -7
- transcribe/pipelines/pipe_vad.py +0 -1
- transcribe/whisper_llm_serve.py +8 -5
config.py
CHANGED
@@ -25,6 +25,8 @@ logging.getLogger().addHandler(console_handler)
 TEXT_THREHOLD = 6
 # Decision time for an audio segment
 DESIGN_TIME_THREHOLD = 3
+# Maximum speech duration
+MAX_SPEECH_DURATION_S = 15
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
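The new MAX_SPEECH_DURATION_S constant is consumed in transcribe/whisper_llm_serve.py below as a cap on the raw audio buffer. A minimal sketch of the sample math, assuming the 16 kHz rate used in the tests:

SAMPLE_RATE = 16000                    # assumed; matches the test notebook below
max_buffer_samples = SAMPLE_RATE * 15  # SAMPLE_RATE * MAX_SPEECH_DURATION_S
print(max_buffer_samples)              # 240000 samples, i.e. 15 s of float32 audio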
tests/audio_utils.py
ADDED
@@ -0,0 +1,54 @@
+import numpy as np
+import soundfile as sf
+import time
+
+def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
+    """
+    Audio stream generator: reads data from an audio file and yields it as a stream.
+
+    Args:
+        audio_file_path: path to the audio file
+        chunk_size: size of each data chunk (in samples)
+        simulate_realtime: whether to simulate real-time streaming speed
+
+    Yields:
+        numpy.ndarray: one chunk_size-long block of np.float32 data at a time
+    """
+    # Load the audio file
+    audio_data, sample_rate = sf.read(audio_file_path)
+
+    # Make sure the audio data is float32
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+
+    # If the audio is stereo, downmix to mono
+    if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+        audio_data = audio_data.mean(axis=1)
+
+    print(f"Loaded audio file: {audio_file_path}")
+    print(f"Sample rate: {sample_rate} Hz")
+    print(f"Audio length: {len(audio_data)/sample_rate:.2f} seconds")
+
+    # Duration of each chunk in seconds
+    chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
+
+    # Yield the data chunk by chunk
+    audio_len = len(audio_data)
+    for pos in range(0, audio_len, chunk_size):
+        # Grab the current chunk
+        end_pos = min(pos + chunk_size, audio_len)
+        chunk = audio_data[pos:end_pos]
+
+        # Zero-pad the chunk if it falls short of chunk_size
+        if len(chunk) < chunk_size:
+            padded_chunk = np.zeros(chunk_size, dtype=np.float32)
+            padded_chunk[:len(chunk)] = chunk
+            chunk = padded_chunk
+
+        # Simulate the delay of real-time processing
+        if simulate_realtime:
+            time.sleep(chunk_duration)
+
+        yield chunk
+
+    print("Audio stream finished")
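A minimal consumer of the generator, for reference ("sample.wav" is a placeholder path; simulate_realtime=False skips the per-chunk sleep so a file can be scanned as fast as possible):

from audio_utils import audio_stream_generator

total = 0
for chunk in audio_stream_generator("sample.wav", chunk_size=4096, simulate_realtime=False):
    total += len(chunk)  # every chunk is exactly 4096 float32 samples; the last one is zero-padded
print(f"consumed {total} samples")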
tests/test_vad.ipynb
ADDED
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from audio_utils import audio_stream_generator\n",
+    "import IPython.display as ipd\n",
+    "import sys\n",
+    "sys.path.append(\"..\")\n",
+    "from transcribe.helpers.vadprocessor import FixedVADIterator\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vac = FixedVADIterator(\n",
+    "    threshold=0.5,\n",
+    "    sampling_rate=16000,\n",
+    "    # speech_pad_ms=10\n",
+    "    min_silence_duration_ms = 100,\n",
+    "    # speech_pad_ms = 30,\n",
+    "    max_speech_duration_s=5.0,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SAMPLE_FILE_PATH = \"/Users/david/Samples/Audio/zh/liyongle.wav\"\n",
+    "SAMPLING_RATE = 16000\n",
+    "\n",
+    "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
+    "vac.reset_states()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded audio file: /Users/david/Samples/Audio/zh/liyongle.wav\n",
+      "Sample rate: 16000 Hz\n",
+      "Audio length: 64.00 seconds\n",
+      "{'start': 3616}\n",
+      "{'end': 83968}\n",
+      "{'end': 164352}\n",
+      "{'end': 244736}\n",
+      "{'end': 325120}\n",
+      "{'end': 405504}\n",
+      "{'end': 485888}\n",
+      "{'end': 566272}\n",
+      "{'end': 624608}\n",
+      "{'start': 631328}\n",
+      "{'end': 691168}\n",
+      "{'start': 698912}\n",
+      "{'end': 779264}\n",
+      "{'end': 800736}\n",
+      "{'start': 805920}\n",
+      "{'end': 846816}\n",
+      "{'start': 855072}\n",
+      "{'end': 862176}\n",
+      "{'start': 864288}\n",
+      "{'end': 890336}\n",
+      "{'start': 893984}\n",
+      "{'end': 912352}\n",
+      "{'start': 917536}\n",
+      "{'end': 932320}\n",
+      "{'start': 939040}\n",
+      "{'end': 966112}\n",
+      "{'start': 970784}\n",
+      "{'end': 1015264}\n",
+      "{'start': 1019424}\n",
+      "Audio stream finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "for chunk in chunks_generator:\n",
+    "    # vad_iterator.reset_states()\n",
+    "    # audio_buffer = np.append(audio_buffer, chunk)\n",
+    "    \n",
+    "    speech_dict = vac(chunk, return_seconds=False)\n",
+    "    if speech_dict:\n",
+    "        print(speech_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
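The evenly spaced 'end' events in the captured output are the max_speech_duration_s=5.0 cap splitting one long speech run into fixed-length segments. A quick check of the spacing from the numbers above:

ends = [83968, 164352, 244736, 325120, 405504, 485888, 566272]
gaps = [b - a for a, b in zip(ends, ends[1:])]
print(set(gaps))      # {80384}: a constant gap between forced segment ends
print(80384 / 16000)  # 5.024 s, i.e. roughly max_speech_duration_s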
transcribe/helpers/vadprocessor.py
CHANGED
@@ -155,7 +155,7 @@ class VADIteratorOnnx:
             raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
+        # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
 
@@ -184,7 +184,7 @@ class VADIteratorOnnx:
         self.current_sample += window_size_samples
 
         speech_prob = self.model(x, self.sampling_rate)[0,0]
-
+
         if (speech_prob >= self.threshold) and self.temp_end:
             self.temp_end = 0
 
@@ -196,11 +196,11 @@
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
 
-        if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
-            if self.temp_end:
-                self.temp_end = 0
-            self.start = self.current_sample
-            return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
+        # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
+        #     if self.temp_end:
+        #         self.temp_end = 0
+        #     self.start = self.current_sample
+        #     return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
 
         if (speech_prob < self.threshold - 0.15) and self.triggered:
             if not self.temp_end:
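The disabled branch is presumably the "max speech duration bug" from the commit title: whenever a speech run exceeded max_speech_samples it returned an 'end' and restarted the segment at the current sample, but it left self.triggered set, so the forced boundary did not behave like a genuine end of speech. The duration cap now lives on the raw frame buffer in transcribe/whisper_llm_serve.py instead; see the sketch after that diff.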
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -33,7 +33,6 @@ class VadPipe(BasePipe):
             # speech_pad_ms=10
             min_silence_duration_ms = 100,
             # speech_pad_ms = 30,
-            max_speech_duration_s=20.0,
         )
         cls.vac.reset_states()
 
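With max_speech_samples commented out in VADIteratorOnnx, the max_speech_duration_s=20.0 argument passed here no longer has any effect, so the call site drops it.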
transcribe/whisper_llm_serve.py
CHANGED
@@ -145,7 +145,13 @@
             self.frames_np_start_timestamp = time.time()
             # Append the audio to the audio buffer
             self.frames_np = np.append(self.frames_np, frame_np)
-            if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+            if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
+                audio_array=self.frames_np.copy()
+                self.full_segments_queue.appendleft(audio_array)  # merge audio chunks depending on whether the 3-second length is met
+                self.frames_np_start_timestamp = time.time()
+                self.frames_np = np.array([], dtype=np.float32)
+
+            elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
                 time_diff = time.time() - self.frames_np_start_timestamp
                 if time_diff >= config.DESIGN_TIME_THREHOLD:
                     audio_array=self.frames_np.copy()
 
@@ -160,10 +166,7 @@
 
     def _transcription_processing_loop(self) -> None:
         """Main transcription processing loop"""
-
-        # loop_start_time = time.perf_counter()
-        # 1. Audio that has been buffered for less than 3 s gets concatenated with what follows
-        # 2. Actively break the segment once it exceeds 25 s
+        frame_epoch = 1
 
         while not self._translate_thread_stop.is_set():
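Taken together, the new branch force-flushes the frame buffer into full_segments_queue once it holds MAX_SPEECH_DURATION_S seconds of audio, regardless of VAD state, while the END branch keeps the DESIGN_TIME_THREHOLD check. A standalone sketch of that buffering policy, assuming 16 kHz audio (names mirror the diff; the queue and the surrounding class are simplified away):

import time
from collections import deque

import numpy as np

SAMPLE_RATE = 16000           # assumed, as in the tests
MAX_SPEECH_DURATION_S = 15    # mirrors config.MAX_SPEECH_DURATION_S
DESIGN_TIME_THREHOLD = 3      # mirrors config.DESIGN_TIME_THREHOLD

frames = np.array([], dtype=np.float32)
segment_start = time.time()
full_segments_queue = deque()

def on_frame(frame_np, speech_status):
    """Simplified stand-in for the service's buffering logic."""
    global frames, segment_start
    frames = np.append(frames, frame_np)
    if len(frames) >= SAMPLE_RATE * MAX_SPEECH_DURATION_S:
        # Hard cap: flush a ~15 s segment even though speech has not ended yet.
        full_segments_queue.appendleft(frames.copy())
        segment_start = time.time()
        frames = np.array([], dtype=np.float32)
    elif speech_status == "END" and len(frames) > 0:
        # On end of speech, flush only if the segment spans >= 3 s of wall time.
        if time.time() - segment_start >= DESIGN_TIME_THREHOLD:
            full_segments_queue.appendleft(frames.copy())
            segment_start = time.time()
            frames = np.array([], dtype=np.float32)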