| import re |
| import tiktoken |
| from typing import List, Tuple, Dict |
|
|
class TextUtils:
    """Utility helpers for text processing.

    All methods are static; the class is a namespace for token counting,
    cleaning, language detection, sentence splitting, dialogue extraction,
    keyword extraction and reading-time estimation. Designed for mixed
    Chinese/English text.
    """

    @staticmethod
    def count_tokens(text: str, model: str = "gpt-4") -> int:
        """Return the (approximate) number of tokens in *text*.

        Tries tiktoken's encoder for *model*; if tiktoken is unavailable or
        the model name is unknown, falls back to a character heuristic:
        ~1.5 chars per token for Chinese, ~4 chars per token otherwise.
        """
        try:
            encoding = tiktoken.encoding_for_model(model)
            return len(encoding.encode(text))
        except Exception:
            # Was a bare `except:` -- narrowed so SystemExit/KeyboardInterrupt
            # are not swallowed; any tiktoken failure still hits the fallback.
            chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
            english_chars = len(text) - chinese_chars
            return int(chinese_chars / 1.5 + english_chars / 4)

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and strip characters outside the allowed set.

        Keeps word characters, whitespace, common ASCII punctuation, CJK
        quote/bracket marks and CJK ideographs; everything else is removed.
        """
        # Collapse any whitespace run (including newlines) to a single space.
        text = re.sub(r'\s+', ' ', text)
        # Remove everything outside the punctuation/ideograph whitelist.
        text = re.sub(r'[^\w\s,.!?;:\'\"()\-—《》「」『』【】\u4e00-\u9fff]', '', text)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> List[str]:
        """Split *text* into sentences on ASCII/CJK terminal punctuation.

        The delimiters themselves are discarded; empty fragments are dropped.
        NOTE(review): abbreviations like "Mr." are also treated as sentence
        boundaries -- acceptable for a rough splitter.
        """
        sentences = re.split(r'[.!?。!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    @staticmethod
    def detect_language(text: str) -> str:
        """Classify *text* as "zh", "en", "mixed" or "unknown".

        Ratio of CJK ideographs to (CJK + Latin letters): above 30% -> "zh",
        below 10% -> "en", in between -> "mixed". No letters at all
        -> "unknown".
        """
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        total_chars = chinese_chars + english_chars
        if total_chars == 0:
            return "unknown"

        chinese_ratio = chinese_chars / total_chars
        if chinese_ratio > 0.3:
            return "zh"
        elif chinese_ratio < 0.1:
            return "en"
        else:
            return "mixed"

    @staticmethod
    def extract_dialogues(text: str, language: str = "en") -> List[Dict]:
        """Extract quoted dialogue from *text*.

        Returns a list of dicts, sorted by position, with keys:
            content     -- the quoted speech (without the quote marks)
            attribution -- trailing speech tag (e.g. "she said", "他说"), or ''
            position    -- character offset of the match in *text*
        Matches are de-duplicated by start offset, so the attributed pattern
        wins over the bare-quote fallback for the same quote.
        NOTE(review): the bare-quote fallback also picks up quoted
        non-dialogue text (titles, scare quotes).
        """
        dialogues = []

        if language == "zh":
            # Curly CJK quotes, optionally followed by a speech verb.
            patterns = [
                r'"([^"]+)"[,,]?\s*([^说道讲告诉问答叫喊]*(?:说|道|讲|告诉|问|答|叫|喊))',
                r'"([^"]+)"',
                r'「([^」]+)」',
                r'『([^』]+)』'
            ]
        else:
            # Quotes optionally followed by a `<speaker> said`-style tag.
            # Fixes the original `[^said]*`, which was a character class
            # excluding the LETTERS s/a/i/d (so "she said" never matched),
            # not the word "said".
            patterns = [
                r'"([^"]+)",?\s*((?:\w+\s+)*?(?:said|asked|replied|shouted|whispered|muttered))',
                r'"([^"]+)"',
                r"'([^']+)',?\s*((?:\w+\s+)*?(?:said|asked|replied))",
                r"'([^']+)'"
            ]

        seen_starts = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = match.start()
                if start in seen_starts:
                    # Already captured by a more specific, earlier pattern.
                    continue
                seen_starts.add(start)
                dialogues.append({
                    'content': match.group(1),
                    'attribution': match.group(2) if len(match.groups()) > 1 else '',
                    'position': start
                })

        dialogues.sort(key=lambda d: d['position'])
        return dialogues

    @staticmethod
    def truncate_text(text: str, max_length: int,
                      ellipsis: str = "...") -> str:
        """Truncate *text* to at most *max_length* chars, appending *ellipsis*.

        Returns *text* unchanged when it already fits.
        """
        if len(text) <= max_length:
            return text
        if max_length <= len(ellipsis):
            # No room for the ellipsis itself; the original negative slice
            # here could return a string LONGER than max_length. Hard cut.
            return text[:max_length]
        return text[:max_length - len(ellipsis)] + ellipsis

    @staticmethod
    def extract_keywords(text: str, top_n: int = 10) -> List[str]:
        """Return up to *top_n* most frequent non-stopword words, lowercased.

        Simple frequency counting -- no stemming or weighting.
        NOTE(review): the word regex treats an unbroken run of CJK characters
        as one "word", and the `len(w) > 2` filter drops 1-2 character
        tokens, so the Chinese stop words below rarely take effect; real
        Chinese keyword extraction would need word segmentation.
        """
        words = re.findall(r'\b\w+\b', text.lower())

        # English function words plus common Chinese particles/pronouns.
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一',
            '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有'
        }

        # Drop stop words and very short tokens.
        filtered_words = [w for w in words if w not in stop_words and len(w) > 2]

        from collections import Counter
        word_freq = Counter(filtered_words)

        return [word for word, _ in word_freq.most_common(top_n)]

    @staticmethod
    def estimate_reading_time(text: str, wpm: int = 200) -> int:
        """Estimate reading time in whole minutes (minimum 1).

        Regex-delimited words are read at *wpm* words/minute; Chinese
        characters at a fixed 500 chars/minute (not scaled by *wpm*).
        NOTE(review): CJK runs also match the word regex, so Chinese text is
        counted in both terms -- confirm intended before relying on exact
        values for Chinese input.
        """
        words = len(re.findall(r'\b\w+\b', text))
        chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))

        reading_time = chinese_chars / 500 + words / wpm
        return max(1, int(reading_time))