Xin Zhang committed on
Commit
9f6a51c
·
1 Parent(s): fca9809

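[fix]: switch LLM_SYS_PROMPT to a Chinese translation prompt (old prompt kept as LLM_SYS_PROMPT1), strip control characters from Whisper segment text, and match punctuation markers against the last character of each segment.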

Browse files
config.py CHANGED
@@ -19,7 +19,7 @@ SENTENCE_END_MARKERS = ['.', '!', '?', '。', '!', '?', ';', ';', ':', '
19
  PAUSE_END_MARKERS = [',', ',', '、']
20
 
21
  # whisper推理参数
22
- WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
23
  MAX_LENTH_ZH = 4
24
 
25
  WHISPER_PROMPT_EN = "The following is an English sentence."
@@ -30,7 +30,7 @@ WHISPER_MODEL = 'medium-q5_0'
30
  # LLM
31
  LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
32
 
33
- LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
34
  "No matter what the user asks, never answer questions, you only provide translation results. "
35
  "Do not actively initiate dialogue or lead users to ask questions. "
36
  "When you don't know how to translate, just output the original text. "
@@ -41,4 +41,8 @@ LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator,
41
  "for professional audiences."
42
  "Never answer any questions or engage in other forms of dialogue. "
43
  "Only output the translation results.
44
- """
 
 
 
 
 
19
  PAUSE_END_MARKERS = [',', ',', '、']
20
 
21
  # whisper推理参数
22
+ WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
23
  MAX_LENTH_ZH = 4
24
 
25
  WHISPER_PROMPT_EN = "The following is an English sentence."
 
30
  # LLM
31
  LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
32
 
33
+ LLM_SYS_PROMPT1 = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
34
  "No matter what the user asks, never answer questions, you only provide translation results. "
35
  "Do not actively initiate dialogue or lead users to ask questions. "
36
  "When you don't know how to translate, just output the original text. "
 
41
  "for professional audiences."
42
  "Never answer any questions or engage in other forms of dialogue. "
43
  "Only output the translation results.
44
+ """
45
+
46
+ LLM_SYS_PROMPT = """
47
+ 你是一个中英文翻译专家,将用户输入的中文翻译成英文,或将用户输入的英文翻译成中文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。当中文翻译到英文时,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
48
+ """
transcribe/pipelines/pipe_whisper.py CHANGED
@@ -1,5 +1,5 @@
1
 
2
-
3
  from .base import MetaItem, BasePipe, Segment
4
  from ..helpers.whisper import WhisperCPP
5
 
@@ -19,7 +19,15 @@ class WhisperPipe(BasePipe):
19
  source_language = in_data.source_language
20
  segments = self.whisper.transcribe(audio_data, source_language) or []
21
  texts = "".join([s.text for s in segments])
22
- in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=s.text) for s in segments if s.text != "�"]
23
  in_data.transcribe_content = texts
24
  in_data.audio = b""
25
  return in_data
 
 
 
 
 
 
 
 
 
1
 
2
+ import unicodedata
3
  from .base import MetaItem, BasePipe, Segment
4
  from ..helpers.whisper import WhisperCPP
5
 
 
19
  source_language = in_data.source_language
20
  segments = self.whisper.transcribe(audio_data, source_language) or []
21
  texts = "".join([s.text for s in segments])
22
+ in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
23
  in_data.transcribe_content = texts
24
  in_data.audio = b""
25
  return in_data
26
+
27
+ def filter_chinese_printable(self, s):
28
+ printable = []
29
+ bytearray_chars = s.encode('utf-8')
30
+ for char in bytearray_chars.decode('utf-8', errors='replace'):
31
+ if unicodedata.category(char) != 'Cc': # 不可打印字符的分类为 'Cc'
32
+ printable.append(char)
33
+ return ''.join(printable)
transcribe/strategy.py CHANGED
@@ -20,7 +20,7 @@ class TripleTextBuffer:
20
  """
21
  self.history.append((text, index))
22
 
23
-
24
  def get_final_index(self, similarity_threshold=0.7):
25
  """根据文本变化,返回可靠的标点的buffer的位置下标"""
26
  if len(self.history) < 2:
@@ -43,7 +43,7 @@ class TripleTextBuffer:
43
  @staticmethod
44
  def text_similarity(text1, text2):
45
  return SequenceMatcher(None, text1, text2).ratio()
46
-
47
 
48
 
49
  class SegmentManager:
@@ -55,28 +55,28 @@ class SegmentManager:
55
  def handle(self, string):
56
  self._temp_string = string
57
  return self
58
-
59
  @property
60
  def short_sentence(self) -> str:
61
  return "".join(self._commited_short_sentences)
62
-
63
  @property
64
  def segment(self):
65
  return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
66
-
67
  def get_seg_id(self):
68
  return len(self._commited_segments)
69
-
70
  @property
71
  def string(self):
72
  return self._temp_string
73
-
74
-
75
  def commit_short_sentence(self):
76
  """将临时字符串 提交到临时短句"""
77
  self._commited_short_sentences.append(self._temp_string)
78
  self._temp_string = ""
79
-
80
  def commit_segment(self):
81
  """将短句 合并 到长句中"""
82
  self._commited_segments.append(self.short_sentence)
@@ -90,7 +90,7 @@ class SegmentManager:
90
  self.commit_short_sentence()
91
  if is_end_sentence:
92
  self.commit_segment()
93
-
94
  def segement_merge(segments):
95
  """根据标点符号分整句"""
96
  sequences = []
@@ -118,8 +118,9 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
118
  is_end = False
119
 
120
  for idx, seg in enumerate(segments):
 
121
  left_watch_sequences.append(seg)
122
- if seg.text in markers:
123
  seg_index = int(seg.t1 / 100 * sample_rate)
124
  # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
125
  # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
@@ -137,7 +138,7 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
137
  left_watch_idx = 0
138
  is_end = False
139
  sequences = segement_merge(segments)
140
-
141
  if len(sequences) > 2:
142
  logger.info(f"buffer clip via sequence, current length: {len(sequences)}")
143
  is_end = True
@@ -149,4 +150,3 @@ def sequences_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
149
  return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
150
 
151
 
152
-
 
20
  """
21
  self.history.append((text, index))
22
 
23
+
24
  def get_final_index(self, similarity_threshold=0.7):
25
  """根据文本变化,返回可靠的标点的buffer的位置下标"""
26
  if len(self.history) < 2:
 
43
  @staticmethod
44
  def text_similarity(text1, text2):
45
  return SequenceMatcher(None, text1, text2).ratio()
46
+
47
 
48
 
49
  class SegmentManager:
 
55
  def handle(self, string):
56
  self._temp_string = string
57
  return self
58
+
59
  @property
60
  def short_sentence(self) -> str:
61
  return "".join(self._commited_short_sentences)
62
+
63
  @property
64
  def segment(self):
65
  return self._commited_segments[-1] if len(self._commited_segments) > 0 else ""
66
+
67
  def get_seg_id(self):
68
  return len(self._commited_segments)
69
+
70
  @property
71
  def string(self):
72
  return self._temp_string
73
+
74
+
75
  def commit_short_sentence(self):
76
  """将临时字符串 提交到临时短句"""
77
  self._commited_short_sentences.append(self._temp_string)
78
  self._temp_string = ""
79
+
80
  def commit_segment(self):
81
  """将短句 合并 到长句中"""
82
  self._commited_segments.append(self.short_sentence)
 
90
  self.commit_short_sentence()
91
  if is_end_sentence:
92
  self.commit_segment()
93
+
94
  def segement_merge(segments):
95
  """根据标点符号分整句"""
96
  sequences = []
 
118
  is_end = False
119
 
120
  for idx, seg in enumerate(segments):
121
+ print(">>>>>>>>>>>>>>>>>> seg: ", seg)
122
  left_watch_sequences.append(seg)
123
+ if seg.text and seg.text[-1] in markers:
124
  seg_index = int(seg.t1 / 100 * sample_rate)
125
  # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
126
  # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
 
138
  left_watch_idx = 0
139
  is_end = False
140
  sequences = segement_merge(segments)
141
+
142
  if len(sequences) > 2:
143
  logger.info(f"buffer clip via sequence, current length: {len(sequences)}")
144
  is_end = True
 
150
  return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
151
 
152