VoxSum / src /server /services /export_service.py
Luigi's picture
Fix filename sanitization for Chinese titles in export
72c7bbc
from __future__ import annotations
from datetime import datetime
from typing import Tuple
import re
from src.export_utils import (
SUBTITLE_FORMATS,
SUMMARY_FORMATS,
TRANSCRIPT_FORMATS,
export_plain_text,
)
from ..models.export import SummaryExportRequest, TranscriptExportRequest
def _sanitize_filename(title: str) -> str:
"""Sanitize title for use in filename"""
if not title:
return ""
# Remove or replace invalid filename characters
sanitized = re.sub(r'[<>:"/\\|?*]', '', title)
# Replace spaces and other problematic characters with underscores, but keep Unicode letters/numbers
sanitized = re.sub(r'[^\w\-_.]', '_', sanitized, flags=re.UNICODE)
# Remove multiple consecutive underscores
sanitized = re.sub(r'_+', '_', sanitized)
# Remove leading/trailing underscores
sanitized = sanitized.strip('_')
# Limit length
return sanitized[:50] if sanitized else ""
def _build_utterance_tuples(payload: TranscriptExportRequest):
utterances = [(u.start, u.end, u.text) for u in payload.utterances]
has_speakers = any(u.speaker is not None for u in payload.utterances)
utterances_with_speakers = None
if has_speakers:
utterances_with_speakers = [
(u.start, u.end, u.text, u.speaker if u.speaker is not None else 0)
for u in payload.utterances
]
return utterances, utterances_with_speakers
def generate_transcript_export(payload: TranscriptExportRequest) -> Tuple[str, str, str]:
    """Render a transcript in the requested format and name the download.

    Args:
        payload: Export request providing the utterances, the target format
            name, an optional title and, for "Plain Text", the
            ``include_timestamps`` flag.

    Returns:
        A ``(content, filename, mime_type)`` tuple. The filename has the form
        ``transcript[_<title>]_<YYYYmmdd_HHMMSS><extension>``.

    Raises:
        ValueError: If ``payload.format`` is neither a known subtitle format
            nor a known transcript format.
    """
    utterances, utterances_with_speakers = _build_utterance_tuples(payload)

    if payload.format in SUBTITLE_FORMATS:
        fmt = SUBTITLE_FORMATS[payload.format]
        content = fmt["function"](utterances, utterances_with_speakers)
    elif payload.format in TRANSCRIPT_FORMATS:
        fmt = TRANSCRIPT_FORMATS[payload.format]
        if payload.format == "Plain Text":
            # Special-cased so the include_timestamps option can be passed on.
            content = export_plain_text(
                utterances,
                utterances_with_speakers,
                include_timestamps=payload.include_timestamps,
            )
        else:
            content = fmt["function"](utterances, utterances_with_speakers)
    else:
        raise ValueError(f"Unsupported transcript export format: {payload.format}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Decide on the sanitized title, not the raw one: a title made only of
    # removed characters sanitizes to "" and would otherwise leave a dangling
    # "_" (i.e. "transcript__<timestamp>").
    safe_title = _sanitize_filename(payload.title)
    title_part = f"_{safe_title}" if safe_title else ""
    filename = f"transcript{title_part}_{timestamp}{fmt['extension']}"
    return content, filename, fmt["mime_type"]
def generate_summary_export(payload: SummaryExportRequest) -> Tuple[str, str, str]:
    """Render a summary in the requested format and name the download.

    Args:
        payload: Export request providing the summary, its metadata, the
            target format name and an optional title.

    Returns:
        A ``(content, filename, mime_type)`` tuple. The filename has the form
        ``summary[_<title>]_<YYYYmmdd_HHMMSS><extension>``.

    Raises:
        ValueError: If ``payload.format`` is not a known summary format.
    """
    if payload.format not in SUMMARY_FORMATS:
        raise ValueError(f"Unsupported summary export format: {payload.format}")

    fmt = SUMMARY_FORMATS[payload.format]
    content = fmt["function"](payload.summary, payload.metadata)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Decide on the sanitized title, not the raw one: a title made only of
    # removed characters sanitizes to "" and would otherwise leave a dangling
    # "_" (i.e. "summary__<timestamp>").
    safe_title = _sanitize_filename(payload.title)
    title_part = f"_{safe_title}" if safe_title else ""
    filename = f"summary{title_part}_{timestamp}{fmt['extension']}"
    return content, filename, fmt["mime_type"]