#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import Iterable, List, Optional

from toolbox.sentence_segment.sent_tokenize import sent_tokenize


def stream_to_char(stream: Iterable[str]):
    """Flatten a stream of string chunks into a stream of single characters."""
    for chunk in stream:
        for char in chunk:
            yield char


def stream_to_sentence(stream: Iterable[str],
                       language: str = "chinese",
                       sent_tokenizer: str = "regex",
                       fragment_punctuation: Optional[List[str]] = None,
                       ending_punctuation: Optional[List[str]] = None,
                       quick_yield_first_fragment: bool = False,
                       quick_yield_all_fragment: bool = False,
                       min_sentence_length: int = 10,
                       min_fragment_length: int = 2,
                       lookahead_context_size: int = 12,
                       lookback_context_size: int = 12,
                       space_between_sentence: bool = True,
                       ) -> Iterable[str]:
    """
    Incrementally segment a stream of text chunks into sentences.

    Adapted from:
    https://github.com/KoljaB/stream2sentence/blob/master/stream2sentence/stream2sentence.py
    """
    fragment_punctuation = fragment_punctuation or list("\n….!?:;,。!?:;,")
    ending_punctuation = ending_punctuation or list("\n….!?。!?")

    if quick_yield_all_fragment:
        quick_yield_first_fragment = True

    wait_a_quick_yield: bool = quick_yield_first_fragment
    last_ending_position: int = -1

    buffer = ""
    for char in stream_to_char(stream):
        buffer += char
        buffer = buffer.lstrip()
        buffer_length = len(buffer)

        # Quick-yield fragment: emit the buffer as soon as it ends with fragment
        # punctuation, to minimize the latency of the first output.
        if wait_a_quick_yield and buffer_length > min_fragment_length:
            if buffer[-1] in fragment_punctuation:
                yield buffer
                buffer = ""
                if not quick_yield_all_fragment:
                    wait_a_quick_yield = False
                continue

        # Wait until the buffer is long enough to provide lookahead context.
        if buffer_length <= min_sentence_length + lookahead_context_size:
            continue

        if char in ending_punctuation:
            last_ending_position = buffer_length - 1

        # The window within the buffer in which a detected boundary is trusted.
        context_window_end_position = buffer_length - lookahead_context_size - 1
        context_window_start_position = context_window_end_position - lookback_context_size
        if context_window_start_position < 0:
            context_window_start_position = 0

        # sent tokenize
        sentences = sent_tokenize(buffer, language, sent_tokenizer)

        # While streaming char by char, a buffer split into 2 parts may be a
        # spurious split, so verify that the last ending punctuation falls inside
        # the context window; a split into 3 or more parts is taken as evidence
        # that the context is already sufficient.
        condition1 = len(sentences) > 2
        condition2 = (len(sentences) > 1 and
                      context_window_start_position <= last_ending_position <= context_window_end_position)

        if condition1 or condition2:
            total_length_except_last = sum(len(sentence) for sentence in sentences[:-1])
            if total_length_except_last >= min_sentence_length:
                for sentence in sentences[:-1]:
                    yield sentence
                buffer = sentences[-1]
                last_ending_position = -1

    # Flush the rest: merge sentences shorter than min_sentence_length into
    # the following sentence before yielding.
    if len(buffer) > 0:
        sentences = sent_tokenize(buffer, language, sent_tokenizer)
        sentence_buffer = ""

        for sentence in sentences:
            sentence_buffer += sentence

            if len(sentence_buffer) < min_sentence_length:
                if space_between_sentence:
                    sentence_buffer += " "
                continue

            yield sentence_buffer
            sentence_buffer = ""

        if len(sentence_buffer) > 0:
            yield sentence_buffer


def main():
    # Chinese demo with the regex tokenizer (uncomment to use):
    # text = "讹言:“苍天已死,黄天当立;岁在甲子,天下大吉。”令人各以白土书“甲子”二字于家中大门上。"
    # text = "讹言:“苍天已死,黄天当立;岁在甲子,天下大吉。”"
    # language = "chinese"
    # sent_tokenizer = "regex"

    # Indonesian demo with the stanza tokenizer:
    text = "Prof. Dr. Hadi, terima kasih atas kuliahnya. Dr. Sutanto, Bagaimana kabarnya?"
    language = "indonesian"
    sent_tokenizer = "stanza"

    stream = list(text)

    sentence_generator = stream_to_sentence(
        stream,
        language=language,
        sent_tokenizer=sent_tokenizer,
        # quick_yield_all_fragment=True,
        quick_yield_first_fragment=True,
    )

    for sentence in sentence_generator:
        print(sentence)
    return


if __name__ == "__main__":
    main()
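
# A minimal usage sketch (illustrative, not part of the original demo): driving
# stream_to_sentence with multi-character chunks, as a token stream from an LLM
# client would produce. The iter_chunks helper below is a hypothetical name
# introduced here; any Iterable[str] works, because stream_to_char flattens the
# chunks back into characters. The language/tokenizer values mirror main().
#
#     def iter_chunks(text: str, chunk_size: int = 4) -> Iterable[str]:
#         # Yield the text in fixed-size chunks to simulate a token stream.
#         for begin in range(0, len(text), chunk_size):
#             yield text[begin:begin + chunk_size]
#
#     text = "讹言:“苍天已死,黄天当立;岁在甲子,天下大吉。”令人各以白土书“甲子”二字于家中大门上。"
#     for sentence in stream_to_sentence(
#         iter_chunks(text),
#         language="chinese",
#         sent_tokenizer="regex",
#         quick_yield_first_fragment=True,
#     ):
#         print(sentence)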