|
import re
|
|
import emoji
|
|
|
|
def prepare_tts_input_with_context(text: str) -> str:
    """
    Prepares text for a TTS API by cleaning Markdown and adding minimal contextual hints
    for certain Markdown elements like headers. Preserves paragraph separation.

    The passes run in a deliberate order, because several Markdown constructs
    share delimiters:

    1. Emojis are removed.
    2. Fenced code blocks are replaced by "(code block omitted)" before anything
       else can touch their backticks or their contents.
    3. Headers become "Title — ...", "Section — ..." or "Subsection — ...".
    4. Images become "Image: <alt text>" (before links, whose syntax they contain).
    5. Links are reduced to their visible text.
    6. Inline code is prefixed with "code snippet: ".
    7. Emphasis markers (`*`, `_`, `**`, `__`) are stripped.
    8. HTML tags and comments are removed.
    9. Runs of blank lines and of spaces are collapsed; the result is trimmed.

    Args:
        text (str): The raw text containing Markdown or other formatting.

    Returns:
        str: Cleaned text with contextual hints suitable for TTS input.
    """
    # Drop emojis entirely.
    text = emoji.replace_emoji(text, replace='')

    # Fenced code blocks must be handled first: otherwise the inline-code
    # pattern consumes part of the ``` fence and the block is never omitted.
    # Doing it before headers also keeps "# comment" lines inside code from
    # being announced as titles.
    text = re.sub(r"```([\s\S]+?)```", "(code block omitted)", text)

    def header_replacer(match):
        # The number of leading '#' characters decides the spoken label.
        level = len(match.group(1))
        header_text = match.group(2).strip()
        if level == 1:
            label = "Title"
        elif level == 2:
            label = "Section"
        else:
            label = "Subsection"
        # The extra newline separates the header from the text that follows.
        return f"{label} — {header_text}\n"

    # Only horizontal whitespace may separate the hashes from the header text;
    # plain `\s+` could span newlines and pull the next line into an empty header.
    text = re.sub(
        r"^(#{1,6})[^\S\r\n]+(.*)", header_replacer, text, flags=re.MULTILINE
    )

    # Images before links: "![alt](url)" contains "[alt](url)", so the link
    # pattern would otherwise rewrite it to "!alt" and hide it from this pass.
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"Image: \1", text)

    # Links: keep only the visible link text.
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

    # Inline code: announce it so the listener knows it is code.
    text = re.sub(r"`([^`]+)`", r"code snippet: \1", text)

    # Emphasis markers. NOTE: this removes every '*' and '_' in the text,
    # including underscores inside identifiers.
    text = re.sub(r"(\*\*|__|\*|_)", '', text)

    # HTML comments and tags. A tag must start with a letter and be closed by
    # '>', so a stray '<' in prose (e.g. "a < b") no longer deletes the text
    # that follows it.
    text = re.sub(r"<!--[\s\S]*?-->|</?[A-Za-z][^<>]*>", '', text)

    # Collapse runs of blank lines to a single paragraph break.
    text = re.sub(r"\n{2,}", '\n\n', text)

    # Collapse runs of spaces left behind by the removals above.
    text = re.sub(r" {2,}", ' ', text)

    return text.strip()
|
|
|