daihui.zhang committed
Commit · ca5d527
1 Parent(s): 484b9cf

update to vad streaming

Browse files:
- transcribe/helpers/vadprocessor.py (+262, -1)
- transcribe/pipelines/base.py (+1, -0)
- transcribe/pipelines/pipe_vad.py (+72, -18)
- transcribe/translatepipes.py (+3, -3)
- transcribe/whisper_llm_serve.py (+93, -58)
transcribe/helpers/vadprocessor.py
CHANGED
@@ -2,10 +2,15 @@ from copy import deepcopy
 from queue import Queue, Empty
 from time import time
 from config import VAD_MODEL_PATH
-
+from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment

+
 class OnnxWrapper():

     def __init__(self, path, force_onnx_cpu=False):
@@ -178,6 +183,33 @@ class VADIteratorOnnx:



+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([],dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
 class VadV2:
     def __init__(self,
                  threshold: float = 0.5,
@@ -269,6 +301,235 @@ class VadV2:
         return None


+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio:np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+        """
+        Save timestamps in both second and sample indices formats.
+        Args:
+            timestamps (list): List of (start, end) tuples
+            output_prefix (str): Prefix for output files
+        """
+        # Save timestamps in seconds (VTT format)
+        seconds_path = f"{output_prefix}_timestamps_second.txt"
+        with open(seconds_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in seconds format")
+            for start, end in timestamps:
+                s_time = self.format_time(start)
+                e_time = self.format_time(end)
+                line = f"{s_time} --> {e_time}\n"
+                file.write(line)
+
+        # Save timestamps in sample indices
+        indices_path = f"{output_prefix}_timestamps_indices.txt"
+        with open(indices_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in indices format")
+            for start, end in timestamps:
+                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+                file.write(line)
+
+        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array:np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+

 class VadProcessor:
     def __init__(
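The new FixedVADIterator wraps VADIteratorOnnx so callers can push chunks of any length: samples accumulate in self.buffer, only exact 512-sample windows are forwarded to the underlying model, and start/end events from several windows are merged into a single dict. The streaming path in this commit uses FixedVADIterator rather than SileroVADProcessor, which as committed still references names it does not define or import here (self.use_gpu_fp16, get_speech_timestamps, time.time() despite "from time import time", and the timestamp-saving block after detect_speech has lost its def line). A minimal usage sketch, not part of the commit; the constructor arguments mirror the ones VadPipe.init() uses and the audio is synthetic:

import numpy as np
from transcribe.helpers.vadprocessor import FixedVADIterator

vad = FixedVADIterator(
    threshold=0.3,
    sampling_rate=16000,
    min_silence_duration_ms=100,
    max_speech_duration_s=15,
)
vad.reset_states()

offset = 0
# chunk sizes are deliberately not multiples of 512
for n in (160, 480, 1024, 700):
    chunk = (np.random.randn(n) * 0.01).astype(np.float32)
    event = vad(chunk, return_seconds=False)  # None, {'start': s}, {'end': e}, or both
    if event:
        # events carry absolute sample indices; subtracting the running offset gives
        # chunk-relative indices, which is what VadPipe._process_speech_chunk() does
        print({k: v - offset for k, v in event.items()})
    offset += n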
transcribe/pipelines/base.py
CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
     translate_content: str = ''
     source_language: str = 'zh'
     destination_language: str = 'en'
+    speech_status: str = 'END' # "END", "START"


 class BasePipe(Process):
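The new speech_status field is how VadPipe tells downstream stages whether the chunk it just emitted leaves a speech segment open ("START") or closes it ("END"). A tiny sketch of the check a consumer performs, with the field semantics taken from pipe_vad.py and whisper_llm_serve.py in this same commit:

from transcribe.pipelines.base import MetaItem

def segment_closed(item: MetaItem) -> bool:
    """Return True when the VAD marked this chunk as closing a speech segment."""
    # VadPipe.process() sets 'START' while a segment is open and 'END' once it closes;
    # downstream, _frame_processing_loop() flushes its buffer on 'END'.
    return item.speech_status == 'END'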
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -1,6 +1,7 @@

 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import
+from ..helpers.vadprocessor import FixedVADIterator, SileroVADProcessor
+
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
@@ -12,30 +13,83 @@ import logging
 class VadPipe(BasePipe):
     vac = None
     sample_rate = 16000
-    window_size_samples = 512
-    chunk_size = 512
-    prob_threshold=0.5,
-    silence_s=0.5,
-    cache_s=0.25,
-

+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0 # offset of processed frames
+        self._status = 'END'
+

+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+
     @classmethod
     def init(cls):
         if cls.vac is None:
-            cls.vac =
+            cls.vac = FixedVADIterator(
+                threshold=0.3,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms = 100,
+                # speech_pad_ms = 30,
+                max_speech_duration_s=15
+            )
+            cls.vac.reset_states()
+
+
+    # def reduce_noise(self, data):
+    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
+
+    def _process_speech_chunk(self, source_audio:np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = end_frame - self._offset
+            return relative_start_frame, relative_end_frame

     def process(self, in_data: MetaItem) -> MetaItem:
-
-
-
-
+        if self._offset == 0:
+            self.vac.reset_states()
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data: # a speech boundary appeared in this chunk
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame and not rel_end_frame:
+                self._status = "START" # speech started
+                target_audio = source_audio[rel_start_frame:]
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif not rel_start_frame and rel_end_frame:
+                self._status = "END" # speech ended
+                target_audio = source_audio[:rel_end_frame]
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            elif rel_start_frame and rel_end_frame:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = np.array([],dtype=np.float32)
+                # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
-
-
-
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else: # end
+                target_audio = np.array([],dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")

-    # def reduce_noise(self, data):
-    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
+        self._offset += len(source_audio)

-
+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
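VadPipe.process() keeps a running sample offset so the absolute indices returned by FixedVADIterator can be converted into indices inside the current chunk, and then slices the chunk according to which boundary fell inside it. A standalone sketch of that branch logic (not the pipeline code; it deliberately uses "is not None" checks, whereas the committed truthiness tests would treat a start at frame 0 as no start):

import numpy as np

def slice_speech(chunk: np.ndarray, rel_start, rel_end, status: str):
    """Mirror of the per-chunk slicing in VadPipe.process().

    rel_start / rel_end are chunk-relative sample indices (or None), as computed
    by _process_speech_chunk(); status is the previous speech_status.
    Returns (speech_audio, new_status).
    """
    empty = np.array([], dtype=np.float32)
    if rel_start is not None or rel_end is not None:
        if rel_start is not None and rel_end is None:
            return chunk[rel_start:], "START"        # speech opens and keeps going
        if rel_start is None and rel_end is not None:
            return chunk[:rel_end], "END"            # an open segment closes here
        return chunk[rel_start:rel_end], "END"       # a whole segment inside one chunk
    # no boundary reported for this chunk
    if status == "START":
        return chunk, "START"                        # stream the middle of an open segment
    return empty, "END"                              # silence between segments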
transcribe/translatepipes.py
CHANGED
@@ -1,4 +1,4 @@
-from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe


 class TranslatePipes:
@@ -9,7 +9,7 @@ class TranslatePipes:
         self._process = []
         # whisper transcription
         self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
         self._funasr_pipe = self._launch_process(FunASRPipe())

         # LLM translation
@@ -17,7 +17,7 @@ class TranslatePipes:

         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-
+        self._vad_pipe = self._launch_process(VadPipe())

     # def reset(self):
     #     self._vad_pipe.reset()
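whisper_llm_serve.py calls self._translate_pipe.voice_detect(...) and reads .audio and .speech_status off the result, but voice_detect itself is not part of this diff. A hypothetical sketch of a method on TranslatePipes that would fit the pattern of the other pipes; the in_queue / out_queue attribute names are assumptions, not confirmed by the source:

from transcribe.pipelines import MetaItem

def voice_detect(self, audio_bytes: bytes) -> MetaItem:
    # Hypothetical: push one raw-PCM chunk through the VAD pipe and wait for its result.
    item = MetaItem(source_audio=audio_bytes)   # source_audio field assumed from pipe_vad.py usage
    self._vad_pipe.in_queue.put(item)           # assumed queue attribute names
    return self._vad_pipe.out_queue.get()       # MetaItem with .audio and .speech_status set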
transcribe/whisper_llm_serve.py
CHANGED
@@ -8,14 +8,14 @@ from typing import List, Optional, Iterator, Tuple, Any
 import asyncio
 import numpy as np
 import config
-
+import collections
 from api_model import TransResult, Message, DebugResult

 from .utils import log_block, save_to_wave, TestDataWriter
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
-from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem

 logger = getLogger("TranscriptionService")
@@ -43,13 +43,18 @@ class WhisperTranscriptionService:
         self.sample_rate = 16000

         self.lock = threading.Lock()
-
-        self._vad_frame_queue = queue.Queue()
+

         # Text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
         # Send ready status
+        # Raw audio queue
+        self._frame_queue = queue.Queue()
+        # Audio queue buffer
+        self.frames_np = None
+        # Completed audio queue
+        self.segments_queue = collections.deque()

         self._transcrible_analysis = None
         # Start processing threads
@@ -58,10 +63,10 @@ class WhisperTranscriptionService:

         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-            self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        else:
-            self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
@@ -111,24 +116,94 @@ class WhisperTranscriptionService:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)

+    def _apply_voice_activity_detection(self, frame_np:np.array):
+        """Apply voice activity detection to refine the audio buffer"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer"""
         while not self._frame_processing_thread_stop.is_set():
             try:
-
-
-
-
-
-
-
-
-
-                self.
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
             except queue.Empty:
                 pass

+    def _process_transcription_results_2(self, segments: List[TranscriptToken],partial):
+        seg_text = self.text_separator.join(seg.text for seg in segments)
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """Main transcription processing loop"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.2)
+                continue
+
+            with self.lock:
+                if len(self.segments_queue) >0:
+                    audio_buffer = self.segments_queue.pop()
+                    partial = False
+                else:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]  # take 1.5s * epoch worth of audio
+                    partial = True
+
+            if len(audio_buffer) ==0:
+                time.sleep(0.2)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            if len(segments):
+                result = self._process_transcription_results_2(segments, partial)
+                self._send_result_to_client(result)
+            time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # Process transcription results and send them to the client
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")

+
     def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
         """Transcribe audio and return the transcribed segments"""
         log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
@@ -176,47 +251,7 @@
         self._translate_time_cost = round(time_diff, 3)
         return translated_text

-    def _transcription_processing_loop(self) -> None:
-        """Main transcription processing loop"""
-
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None:
-                time.sleep(0.2)
-                continue
-            if len(audio_buffer) < int(self.sample_rate):
-                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
-                silence_audio[-len(audio_buffer):] = audio_buffer
-                audio_buffer = silence_audio

-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-            time.sleep(0.1)
-            # Process transcription results and send them to the client
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item

     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
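Taken together, the two loops now implement a simple producer/consumer scheme: VAD-gated audio accumulates in frames_np, the open segment is transcribed as "partial" from a window that grows by roughly 1.5 s per epoch, and whenever the VAD reports END the whole buffer moves into segments_queue and is transcribed once more as a final (non-partial) result that advances row_number. A toy, single-threaded sketch of that bookkeeping (locking, zero-padding of very short buffers, and the actual transcription and translation calls are omitted):

import collections
import numpy as np

SAMPLE_RATE = 16000

frames = np.array([], dtype=np.float32)   # open, still-growing segment (frames_np)
finished = collections.deque()            # closed segments awaiting a final pass (segments_queue)
frame_epoch = 1

def on_vad_output(speech_audio: np.ndarray, speech_status: str) -> None:
    """Producer side, mirroring _frame_processing_loop()."""
    global frames
    frames = np.append(frames, speech_audio)
    if speech_status == "END" and len(frames) > 0:
        finished.appendleft(frames.copy())
        frames = np.array([], dtype=np.float32)

def next_transcription_job():
    """Consumer side, mirroring the buffer selection in _transcription_processing_loop().

    Returns (audio, partial): partial=True for the growing window of the open segment,
    partial=False exactly once per closed segment.
    """
    global frame_epoch
    if finished:
        audio, partial = finished.pop(), False
        frame_epoch = 1                     # reset the window after a final result
    else:
        audio = frames[: int(frame_epoch * 1.5 * SAMPLE_RATE)]
        partial = True
        frame_epoch += 1                    # widen the partial window next time
    return audio, partial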