daihui.zhang
		
	committed on
		
		
					Commit 
							
							·
						
						ce0e589
	
1
								Parent(s):
							
							3ec4a4f
								
add text length threshold
Browse files
- config.py +2 -0
- tests/test_whisper_cpp.py +2 -2
- transcribe/pipelines/pipe_translate.py +7 -3
- transcribe/strategy.py +4 -4
    	
        config.py
    CHANGED
    
    | @@ -21,6 +21,8 @@ console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s | |
| 21 | 
             
            console_handler.setFormatter(console_formatter)
         | 
| 22 | 
             
            logging.getLogger().addHandler(console_handler)
         | 
| 23 |  | 
|  | |
|  | |
| 24 |  | 
| 25 | 
             
            BASE_DIR = pathlib.Path(__file__).parent
         | 
| 26 | 
             
            MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
         | 
|  | |
| 21 | 
             
            console_handler.setFormatter(console_formatter)
         | 
| 22 | 
             
            logging.getLogger().addHandler(console_handler)
         | 
| 23 |  | 
| 24 | 
            +
            # 文字输出长度阈值
         | 
| 25 | 
            +
            TEXT_THREHOLD = 16
         | 
| 26 |  | 
| 27 | 
             
            BASE_DIR = pathlib.Path(__file__).parent
         | 
| 28 | 
             
            MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
         | 
    	
        tests/test_whisper_cpp.py
    CHANGED
    
    | @@ -3,7 +3,7 @@ import config | |
| 3 | 
             
            import soundfile
         | 
| 4 | 
             
            from pywhispercpp.utils import to_timestamp
         | 
| 5 |  | 
| 6 | 
            -
            mel, _, = soundfile.read("/ | 
| 7 | 
             
            # mel, _, = soundfile.read(f"{config.ASSERT_DIR}/jfk.flac")
         | 
| 8 |  | 
| 9 | 
             
            models_dir = config.MODEL_DIR.as_posix()
         | 
| @@ -19,7 +19,7 @@ model = Model( | |
| 19 | 
             
                          no_context=True
         | 
| 20 | 
             
                          )
         | 
| 21 | 
             
            print(mel.shape, mel.dtype) # (160000,) float64
         | 
| 22 | 
            -
            segments = model.transcribe(mel | 
| 23 | 
             
                                        # initial_prompt="",# 'The following is an English sentence.', # "以下是简体中文句子。"
         | 
| 24 | 
             
                                        language='en',
         | 
| 25 | 
             
                                        # initial_prompt="以下是简体中文句子。",
         | 
|  | |
| 3 | 
             
            import soundfile
         | 
| 4 | 
             
            from pywhispercpp.utils import to_timestamp
         | 
| 5 |  | 
| 6 | 
            +
            mel, _, = soundfile.read("test/6_before_cut_56640.wav")
         | 
| 7 | 
             
            # mel, _, = soundfile.read(f"{config.ASSERT_DIR}/jfk.flac")
         | 
| 8 |  | 
| 9 | 
             
            models_dir = config.MODEL_DIR.as_posix()
         | 
|  | |
| 19 | 
             
                          no_context=True
         | 
| 20 | 
             
                          )
         | 
| 21 | 
             
            print(mel.shape, mel.dtype) # (160000,) float64
         | 
| 22 | 
            +
            segments = model.transcribe(mel,
         | 
| 23 | 
             
                                        # initial_prompt="",# 'The following is an English sentence.', # "以下是简体中文句子。"
         | 
| 24 | 
             
                                        language='en',
         | 
| 25 | 
             
                                        # initial_prompt="以下是简体中文句子。",
         | 
    	
        transcribe/pipelines/pipe_translate.py
    CHANGED
    
    | @@ -2,7 +2,7 @@ | |
| 2 | 
             
            from .base import MetaItem, BasePipe, Segment
         | 
| 3 | 
             
            from llama_cpp import Llama
         | 
| 4 | 
             
            from ..helpers.translator import QwenTranslator
         | 
| 5 | 
            -
            from config import LLM_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH, LLM_LARGE_MODEL_PATH
         | 
| 6 |  | 
| 7 |  | 
| 8 | 
             
            class TranslatePipe(BasePipe):
         | 
| @@ -16,8 +16,12 @@ class TranslatePipe(BasePipe): | |
| 16 |  | 
| 17 | 
             
                def process(self, in_data: MetaItem) -> MetaItem:
         | 
| 18 | 
             
                    context = in_data.transcribe_content
         | 
| 19 | 
            -
                     | 
| 20 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 21 | 
             
                    in_data.translate_content = result
         | 
| 22 | 
             
                    return in_data
         | 
| 23 |  | 
|  | |
| 2 | 
             
            from .base import MetaItem, BasePipe, Segment
         | 
| 3 | 
             
            from llama_cpp import Llama
         | 
| 4 | 
             
            from ..helpers.translator import QwenTranslator
         | 
| 5 | 
            +
            from config import LLM_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH, LLM_LARGE_MODEL_PATH, ALL_MARKERS
         | 
| 6 |  | 
| 7 |  | 
| 8 | 
             
            class TranslatePipe(BasePipe):
         | 
|  | |
| 16 |  | 
| 17 | 
             
                def process(self, in_data: MetaItem) -> MetaItem:
         | 
| 18 | 
             
                    context = in_data.transcribe_content
         | 
| 19 | 
            +
                    all_punctuatioin = all([ch in ALL_MARKERS for ch in context])
         | 
| 20 | 
            +
                    if all_punctuatioin:
         | 
| 21 | 
            +
                        result = ""
         | 
| 22 | 
            +
                    else:
         | 
| 23 | 
            +
                        result = self.translator.translate(
         | 
| 24 | 
            +
                            context, src_lang=in_data.source_language, dst_lang=in_data.destination_language)
         | 
| 25 | 
             
                    in_data.translate_content = result
         | 
| 26 | 
             
                    return in_data
         | 
| 27 |  | 
    	
        transcribe/strategy.py
    CHANGED
    
    | @@ -8,7 +8,7 @@ from typing import List, Tuple, Optional, Deque, Any, Iterator,Literal | |
| 8 | 
             
            from config import SENTENCE_END_MARKERS, ALL_MARKERS,SENTENCE_END_PATTERN,REGEX_MARKERS, PAUSEE_END_PATTERN,SAMPLE_RATE
         | 
| 9 | 
             
            from enum import Enum
         | 
| 10 | 
             
            import wordninja
         | 
| 11 | 
            -
             | 
| 12 | 
             
            import re
         | 
| 13 | 
             
            logger = logging.getLogger("TranscriptionStrategy")
         | 
| 14 |  | 
| @@ -199,7 +199,7 @@ class TranscriptBuffer: | |
| 199 |  | 
| 200 | 
             
                    count = 0
         | 
| 201 | 
             
                    current_sentences = []
         | 
| 202 | 
            -
                    while len(self._sentences) and count < 20:
         | 
| 203 | 
             
                        item = self._sentences.popleft()
         | 
| 204 | 
             
                        current_sentences.append(item)
         | 
| 205 | 
             
                        if self._separator:
         | 
| @@ -265,10 +265,10 @@ class TranscriptBuffer: | |
| 265 | 
             
                            self.update_pending_text(stable_str)
         | 
| 266 | 
             
                            self.commit_line()
         | 
| 267 |  | 
| 268 | 
            -
                        current_text_len = | 
| 269 | 
             
                        # current_text_len = len(self.current_not_commit_text.split(self._separator))
         | 
| 270 | 
             
                        self.update_pending_text(remaining_string)
         | 
| 271 | 
            -
                        if current_text_len >=  | 
| 272 | 
             
                            self.commit_paragraph()
         | 
| 273 | 
             
                            self._current_seg_id += 1
         | 
| 274 | 
             
                            return True
         | 
|  | |
| 8 | 
             
            from config import SENTENCE_END_MARKERS, ALL_MARKERS,SENTENCE_END_PATTERN,REGEX_MARKERS, PAUSEE_END_PATTERN,SAMPLE_RATE
         | 
| 9 | 
             
            from enum import Enum
         | 
| 10 | 
             
            import wordninja
         | 
| 11 | 
            +
            import config
         | 
| 12 | 
             
            import re
         | 
| 13 | 
             
            logger = logging.getLogger("TranscriptionStrategy")
         | 
| 14 |  | 
|  | |
| 199 |  | 
| 200 | 
             
                    count = 0
         | 
| 201 | 
             
                    current_sentences = []
         | 
| 202 | 
            +
                    while len(self._sentences): # and count < 20:
         | 
| 203 | 
             
                        item = self._sentences.popleft()
         | 
| 204 | 
             
                        current_sentences.append(item)
         | 
| 205 | 
             
                        if self._separator:
         | 
|  | |
| 265 | 
             
                            self.update_pending_text(stable_str)
         | 
| 266 | 
             
                            self.commit_line()
         | 
| 267 |  | 
| 268 | 
            +
                        current_text_len = len(self.current_not_commit_text.split(self._separator)) if self._separator else len(self.current_not_commit_text)
         | 
| 269 | 
             
                        # current_text_len = len(self.current_not_commit_text.split(self._separator))
         | 
| 270 | 
             
                        self.update_pending_text(remaining_string)
         | 
| 271 | 
            +
                        if current_text_len >= config.TEXT_THREHOLD:
         | 
| 272 | 
             
                            self.commit_paragraph()
         | 
| 273 | 
             
                            self._current_seg_id += 1
         | 
| 274 | 
             
                            return True
         |