Xin Zhang committed
Commit 9f6a51c
1 Parent(s): fca9809
[fix]: fix.
Browse files
- config.py +7 -3
- transcribe/pipelines/pipe_whisper.py +10 -2
- transcribe/strategy.py +13 -13
config.py
CHANGED

@@ -19,7 +19,7 @@ SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', '
 PAUSE_END_MARKERS = [',', ',', '、']

 # whisper推理参数
-WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
+WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
 MAX_LENTH_ZH = 4

 WHISPER_PROMPT_EN = "The following is an English sentence."

@@ -30,7 +30,7 @@ WHISPER_MODEL = 'medium-q5_0'
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()

-LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
+LLM_SYS_PROMPT1 = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
 "No matter what the user asks, never answer questions, you only provide translation results. "
 "Do not actively initiate dialogue or lead users to ask questions. "
 "When you don't know how to translate, just output the original text. "

@@ -41,4 +41,8 @@ LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator,
 "for professional audiences."
 "Never answer any questions or engage in other forms of dialogue. "
 "Only output the translation results.
-"""
+"""
+
+LLM_SYS_PROMPT = """
+你是一个中英文翻译专家,将用户输入的中文翻译成英文,或将用户输入的英文翻译成中文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。当中文翻译到英文时,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
+"""
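Note on the config change: the English template is kept under the new name LLM_SYS_PROMPT1 and still carries {src_lang} and {dst_lang} placeholders, so the caller presumably fills in concrete language names before handing the prompt to the LLM. A minimal sketch of that substitution, assuming plain str.format and illustrative language names (the actual call site is not part of this commit):

    from config import LLM_SYS_PROMPT1

    # Hypothetical usage: fill the placeholders with concrete languages.
    # The real pipeline may pass other values or use a different mechanism.
    system_prompt = LLM_SYS_PROMPT1.format(src_lang="Chinese", dst_lang="English")
    print(system_prompt[:60])

The new LLM_SYS_PROMPT, by contrast, is a fixed Chinese instruction: it tells the model to act as a Chinese-English translator, to produce faithful, fluent, and elegant output, never to answer questions, and, when translating into English, to emit no Chinese characters. It therefore needs no placeholder substitution.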
transcribe/pipelines/pipe_whisper.py
CHANGED

@@ -1,5 +1,5 @@

-
+import unicodedata
 from .base import MetaItem, BasePipe, Segment
 from ..helpers.whisper import WhisperCPP


@@ -19,7 +19,15 @@ class WhisperPipe(BasePipe):
         source_language = in_data.source_language
         segments = self.whisper.transcribe(audio_data, source_language) or []
         texts = "".join([s.text for s in segments])
-        in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=s.text) for s in segments]
+        in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
         in_data.transcribe_content = texts
         in_data.audio = b""
         return in_data
+
+    def filter_chinese_printable(self, s):
+        printable = []
+        bytearray_chars = s.encode('utf-8')
+        for char in bytearray_chars.decode('utf-8', errors='replace'):
+            if unicodedata.category(char) != 'Cc':  # 不可打印字符的分类为 'Cc'
+                printable.append(char)
+        return ''.join(printable)
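Note on the pipeline change: the new filter_chinese_printable drops every character whose Unicode category is 'Cc' (control characters such as NUL or newline) and keeps everything else, including CJK text; the encode/decode round-trip with errors='replace' is effectively a pass-through for well-formed strings. A standalone sketch of the same logic (an illustration, not the pipeline class itself):

    import unicodedata

    def filter_printable(s: str) -> str:
        # Same idea as WhisperPipe.filter_chinese_printable: strip 'Cc' characters,
        # keep everything else (CJK text, punctuation, spaces).
        decoded = s.encode('utf-8').decode('utf-8', errors='replace')
        return ''.join(ch for ch in decoded if unicodedata.category(ch) != 'Cc')

    print(filter_printable("你好\x00world\n"))  # '\x00' and '\n' are 'Cc' -> "你好world"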
transcribe/strategy.py
CHANGED

@@ -20,7 +20,7 @@ class TripleTextBuffer:
         """
         self.history.append((text, index))

-
+
     def get_final_index(self, similarity_threshold=0.7):
         """根据文本变化,返回可靠的标点的buffer的位置下标"""
         if len(self.history) < 2:

@@ -43,7 +43,7 @@ class TripleTextBuffer:
     @staticmethod
     def text_similarity(text1, text2):
         return SequenceMatcher(None, text1, text2).ratio()
-
+


 class SegmentManager:

@@ -55,28 +55,28 @@ class SegmentManager:
     def handle(self, string):
         self._temp_string = string
         return self
-
+
     @property
     def short_sentence(self) -> str:
         return "".join(self._commited_short_sentences)
-
+
     @property
     def segment(self):
         return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
-
+
     def get_seg_id(self):
         return len(self._commited_segments)
-
+
     @property
     def string(self):
         return self._temp_string
-
-
+
+
     def commit_short_sentence(self):
         """将临时字符串 提交到临时短句"""
         self._commited_short_sentences.append(self._temp_string)
         self._temp_string = ""
-
+
     def commit_segment(self):
         """将短句 合并 到长句中"""
         self._commited_segments.append(self.short_sentence)

@@ -90,7 +90,7 @@ class SegmentManager:
         self.commit_short_sentence()
         if is_end_sentence:
             self.commit_segment()
-
+
 def segement_merge(segments):
     """根据标点符号分整句"""
     sequences = []

@@ -118,8 +118,9 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     is_end = False

     for idx, seg in enumerate(segments):
+        print(">>>>>>>>>>>>>>>>>> seg: ", seg)
         left_watch_sequences.append(seg)
-        if seg.text in markers:
+        if seg.text and seg.text[-1] in markers:
             seg_index = int(seg.t1 / 100 * sample_rate)
             # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
             # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)

@@ -137,7 +138,7 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     left_watch_idx = 0
     is_end = False
     sequences = segement_merge(segments)
-
+
     if len(sequences) > 2:
         logger.info(f"buffer clip via sequence, current length: {len(sequences)}")
         is_end = True

@@ -149,4 +150,3 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
     return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end


-
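Note on the strategy change: the behavioral fix in segments_split is the punctuation test. The old seg.text in markers only fired when a segment's entire text was exactly one marker string, whereas the new seg.text and seg.text[-1] in markers fires for any non-empty segment that ends with a marker, and skips empty text safely. A small sketch of the difference, using an illustrative marker list (the real one presumably comes from config):

    markers = ['.', '!', '?', '。', '!', '?', ';', ';']
    texts = ["你好。", "。", ""]

    old_hits = [t for t in texts if t in markers]            # ['。']
    new_hits = [t for t in texts if t and t[-1] in markers]  # ['你好。', '。']

    print(old_hits, new_hits)

The added print(">>>>>>>>>>>>>>>>>> seg: ", seg) line is a debug trace on each incoming segment; the remaining hunks in this file are whitespace-only changes.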