| import re |
| import tiktoken |
| from typing import List, Tuple, Dict |
|
|
class TextUtils:
    """Utility helpers for text processing.

    All methods are static; the class is a namespace for token counting,
    cleaning, language detection, sentence splitting, dialogue extraction,
    keyword extraction and reading-time estimation. Designed for mixed
    Chinese/English text.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Return the (approximate) number of tokens in *text*.

        Tries tiktoken's encoder for *model*; if tiktoken is unavailable or
        the model name is unknown, falls back to a character heuristic:
        ~1.5 chars per token for Chinese, ~4 chars per token otherwise.
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Was a bare `except:` -- narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed; any tiktoken failure still hits the fallback.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and strip characters outside the allowed set.

        Keeps word characters, whitespace, common ASCII punctuation, CJK
        quote/bracket marks and CJK ideographs; everything else is removed.
        """
        # Collapse any whitespace run (including newlines) to a single space.
        text = re.sub(r'\s+', ' ', text)
        # Remove everything outside the punctuation/ideograph whitelist.
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split *text* into sentences on ASCII/CJK terminal punctuation.

        The delimiters themselves are discarded; empty fragments are dropped.
        NOTE(review): abbreviations like "Mr." are also treated as sentence
        boundaries -- acceptable for a rough splitter.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify *text* as "zh", "en", "mixed" or "unknown".

        Ratio of CJK ideographs to (CJK + Latin letters): above 30% -> "zh",
        below 10% -> "en", in between -> "mixed". No letters at all
        -> "unknown".
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            return "unknown"

        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue from *text*.

        Returns a list of dicts, sorted by position, with keys:
            content     -- the quoted speech (without the quote marks)
            attribution -- trailing speech tag (e.g. "she said", "他说"), or ''
            position    -- character offset of the match in *text*
        Matches are de-duplicated by start offset, so the attributed pattern
        wins over the bare-quote fallback for the same quote.
        NOTE(review): the bare-quote fallback also picks up quoted
        non-dialogue text (titles, scare quotes).
        """
        dialogues = []

        if language == "zh":
            # Curly CJK quotes, optionally followed by a speech verb.
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # Quotes optionally followed by a `<speaker> said`-style tag.
            # Fixes the original `[^said]*`, which was a character class
            # excluding the LETTERS s/a/i/d (so "she said" never matched),
            # not the word "said".
            patterns = [
                r'"([^"]+)",?\s*((?:\w+\s+)*?(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*((?:\w+\s+)*?(?:said|asked|replied))",
                r"'([^']+)'"
            ]

        seen_starts = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = match.start()
                if start in seen_starts:
                    # Already captured by a more specific, earlier pattern.
                    continue
                seen_starts.add(start)
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': start
                })

        dialogues.sort(key=lambda d: d['position'])
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate *text* to at most *max_length* chars, appending *ellipsis*.

        Returns *text* unchanged when it already fits.
        """
        if len(text) <= max_length:
            return text
        if max_length <= len(ellipsis):
            # No room for the ellipsis itself; the original negative slice
            # here could return a string LONGER than max_length. Hard cut.
            return text[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return up to *top_n* most frequent non-stopword words, lowercased.

        Simple frequency counting -- no stemming or weighting.
        NOTE(review): the word regex treats an unbroken run of CJK characters
        as one "word", and the `len(w) > 2` filter drops 1-2 character
        tokens, so the Chinese stop words below rarely take effect; real
        Chinese keyword extraction would need word segmentation.
        """
        words = re.findall(r'\b\w+\b', text.lower())

        # English function words plus common Chinese particles/pronouns.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }

        # Drop stop words and very short tokens.
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

        from collections import Counter
        word_freq = Counter(filtered_words)

        return [word for word, _ in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        Regex-delimited words are read at *wpm* words/minute; Chinese
        characters at a fixed 500 chars/minute (not scaled by *wpm*).
        NOTE(review): CJK runs also match the word regex, so Chinese text is
        counted in both terms -- confirm intended before relying on exact
        values for Chinese input.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        reading_time = chinese_chars / 500 + words / wpm
        return max(1, int(reading_time))