Xin Zhang committed
Commit 9f6a51c
1 Parent(s): fca9809
[fix]: fix.
Browse files
- config.py +7 -3
- transcribe/pipelines/pipe_whisper.py +10 -2
- transcribe/strategy.py +13 -13
config.py
CHANGED

@@ -19,7 +19,7 @@ SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', '
 PAUSE_END_MARKERS = [',', ',', '、']

 # whisper推理参数
-WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
+WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
 MAX_LENTH_ZH = 4

 WHISPER_PROMPT_EN = "The following is an English sentence."

@@ -30,7 +30,7 @@ WHISPER_MODEL = 'medium-q5_0'
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()

-LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
+LLM_SYS_PROMPT1 = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
 "No matter what the user asks, never answer questions, you only provide translation results. "
 "Do not actively initiate dialogue or lead users to ask questions. "
 "When you don't know how to translate, just output the original text. "

@@ -41,4 +41,8 @@ LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator,
 "for professional audiences."
 "Never answer any questions or engage in other forms of dialogue. "
 "Only output the translation results.
-"""
+"""
+
+LLM_SYS_PROMPT = """
+你是一个中英文翻译专家,将用户输入的中文翻译成英文,或将用户输入的英文翻译成中文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。当中文翻译到英文时,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
+"""
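Note on the config change: the English template is kept under the new name LLM_SYS_PROMPT1 and still carries {src_lang} and {dst_lang} placeholders, so the caller presumably fills in concrete language names before handing the prompt to the LLM. A minimal sketch of that substitution, assuming plain str.format and illustrative language names (the actual call site is not part of this commit):

    from config import LLM_SYS_PROMPT1

    # Hypothetical usage: fill the placeholders with concrete languages.
    # The real pipeline may pass other values or use a different mechanism.
    system_prompt = LLM_SYS_PROMPT1.format(src_lang="Chinese", dst_lang="English")
    print(system_prompt[:60])

The new LLM_SYS_PROMPT, by contrast, is a fixed Chinese instruction: it tells the model to act as a Chinese-English translator, to produce faithful, fluent, and elegant output, never to answer questions, and, when translating into English, to emit no Chinese characters. It therefore needs no placeholder substitution.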
transcribe/pipelines/pipe_whisper.py
CHANGED

@@ -1,5 +1,5 @@

-
+import unicodedata
 from .base import MetaItem, BasePipe, Segment
 from ..helpers.whisper import WhisperCPP


@@ -19,7 +19,15 @@ class WhisperPipe(BasePipe):
         source_language = in_data.source_language
         segments = self.whisper.transcribe(audio_data, source_language) or []
         texts = "".join([s.text for s in segments])
-        in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=s.text) for s in segments]
+        in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
         in_data.transcribe_content = texts
         in_data.audio = b""
         return in_data
+
+    def filter_chinese_printable(self, s):
+        printable = []
+        bytearray_chars = s.encode('utf-8')
+        for char in bytearray_chars.decode('utf-8', errors='replace'):
+            if unicodedata.category(char) != 'Cc':  # 不可打印字符的分类为 'Cc'
+                printable.append(char)
+        return ''.join(printable)
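Note on the pipeline change: the new filter_chinese_printable drops every character whose Unicode category is 'Cc' (control characters such as NUL or newline) and keeps everything else, including CJK text; the encode/decode round-trip with errors='replace' is effectively a pass-through for well-formed strings. A standalone sketch of the same logic (an illustration, not the pipeline class itself):

    import unicodedata

    def filter_printable(s: str) -> str:
        # Same idea as WhisperPipe.filter_chinese_printable: strip 'Cc' characters,
        # keep everything else (CJK text, punctuation, spaces).
        decoded = s.encode('utf-8').decode('utf-8', errors='replace')
        return ''.join(ch for ch in decoded if unicodedata.category(ch) != 'Cc')

    print(filter_printable("你好\x00world\n"))  # '\x00' and '\n' are 'Cc' -> "你好world"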
transcribe/strategy.py
CHANGED

@@ -20,7 +20,7 @@ class TripleTextBuffer:
         """
         self.history.append((text, index))

-
+
     def get_final_index(self, similarity_threshold=0.7):
         """根据文本变化,返回可靠的标点的buffer的位置下标"""
         if len(self.history) < 2:

@@ -43,7 +43,7 @@ class TripleTextBuffer:
     @staticmethod
     def text_similarity(text1, text2):
         return SequenceMatcher(None, text1, text2).ratio()
-
+


 class SegmentManager:

@@ -55,28 +55,28 @@ class SegmentManager:
     def handle(self, string):
         self._temp_string = string
         return self
-
+
     @property
     def short_sentence(self) -> str:
         return "".join(self._commited_short_sentences)
-
+
     @property
     def segment(self):
         return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
-
+
     def get_seg_id(self):
         return len(self._commited_segments)
-
+
     @property
     def string(self):
         return self._temp_string
-
-
+
+
     def commit_short_sentence(self):
         """将临时字符串 提交到临时短句"""
         self._commited_short_sentences.append(self._temp_string)
         self._temp_string = ""
-
+
     def commit_segment(self):
         """将短句 合并 到长句中"""
         self._commited_segments.append(self.short_sentence)

@@ -90,7 +90,7 @@ class SegmentManager:
         self.commit_short_sentence()
         if is_end_sentence:
             self.commit_segment()
-
+
 def segement_merge(segments):
     """根据标点符号分整句"""
     sequences = []

@@ -118,8 +118,9 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     is_end = False

     for idx, seg in enumerate(segments):
+        print(">>>>>>>>>>>>>>>>>> seg: ", seg)
         left_watch_sequences.append(seg)
-        if seg.text in markers:
+        if seg.text and seg.text[-1] in markers:
             seg_index = int(seg.t1 / 100 * sample_rate)
             # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
             # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)

@@ -137,7 +138,7 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     left_watch_idx = 0
     is_end = False
     sequences = segement_merge(segments)
-
+
     if len(sequences) > 2:
         logger.info(f"buffer clip via sequence, current length: {len(sequences)}")
         is_end = True

@@ -149,4 +150,3 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end


-
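Note on the strategy change: the behavioral fix in segments_split is the punctuation test. The old seg.text in markers only fired when a segment's entire text was exactly one marker string, whereas the new seg.text and seg.text[-1] in markers fires for any non-empty segment that ends with a marker, and skips empty text safely. A small sketch of the difference, using an illustrative marker list (the real one presumably comes from config):

    markers = ['.', '!', '?', '。', '!', '?', ';', ';']
    texts = ["你好。", "。", ""]

    old_hits = [t for t in texts if t in markers]            # ['。']
    new_hits = [t for t in texts if t and t[-1] in markers]  # ['你好。', '。']

    print(old_hits, new_hits)

The added print(">>>>>>>>>>>>>>>>>> seg: ", seg) line is a debug trace on each incoming segment; the remaining hunks in this file are whitespace-only changes.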