#!/usr/bin/env python3
"""
Hugging Face Space landing page - MOSS-TTSD

Based on the fnlp/MOSS-TTSD Space, enhanced with this repo's UI and docs:
- Chinese interface by default, keeping the simple workflow
- Scenario selection with one-click loading
- Optional text normalization
- Concise usage notes and documentation links on the right

To run this Space script locally:
    python hf_space/app.py
"""

import os
import json
import time
import shutil
import tempfile
from typing import Optional, Tuple

import gradio as gr
import torch
import torchaudio

# HF Spaces GPU scheduling
try:
    import spaces  # Available on HF Spaces; absent locally, which is fine
except Exception:  # noqa: BLE001
    class _DummySpaces:
        # Fallback placeholder so running locally does not error out
        def GPU(self, *args, **kwargs):  # type: ignore[override]
            def deco(fn):
                return fn
            return deco

    spaces = _DummySpaces()  # type: ignore

from huggingface_hub import hf_hub_download

# Reuse this repo's shared inference utilities
from generation_utils import load_model, process_batch

# =========================
# Configuration
# =========================
SYSTEM_PROMPT = (
    "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
)

# Scenario configuration mapping
SCENARIO_CONFIG = {
    "科技播客_AI发展": {
        "title": "🤖 科技播客 - AI发展趋势",
        "description": "探讨人工智能的最新发展与未来趋势",
        "file": "scenarios/科技播客_AI发展.jsonl",
    },
    "教育播客_学习方法": {
        "title": "📚 教育播客 - 高效学习方法",
        "description": "分享科学的学习方法与技巧",
        "file": "scenarios/教育播客_学习方法.jsonl",
    },
    "生活播客_美食文化": {
        "title": "🍜 生活播客 - 美食文化探索",
        "description": "品味各地美食文化的魅力",
        "file": "scenarios/生活播客_美食文化.jsonl",
    },
    "商业播客_创业经验": {
        "title": "💼 商业播客 - 创业经验分享",
        "description": "创业路上的经验教训与心得",
        "file": "scenarios/商业播客_创业经验.jsonl",
    },
    "健康播客_运动健身": {
        "title": "🏃 健康播客 - 运动健身指南",
        "description": "科学健身与健康生活方式",
        "file": "scenarios/健康播客_运动健身.jsonl",
    },
    "心理播客_情绪管理": {
        "title": "🧠 心理播客 - 情绪管理技巧",
        "description": "探索情绪管理与心理健康",
        "file": "scenarios/心理播客_情绪管理.jsonl",
    },
}

# Default audio configuration
DEFAULT_AUDIO_CONFIG = {
    "speaker1": {
        "audio": "examples/zh_spk1_moon.wav",
        "text": "周一到周五,每天早晨七点半到九点半的直播片段。言下之意呢,就是废话有点多,大家也别嫌弃,因为这都是直播间最真实的状态了。",
    },
    "speaker2": {
        "audio": "examples/zh_spk2_moon.wav",
        "text": "如果大家想听到更丰富更及时的直播内容,记得在周一到周五准时进入直播间,和大家一起畅聊新消费新科技新趋势。",
    },
}

MODEL_PATH = "fnlp/MOSS-TTSD-v0.5"
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"

# Automatically download the XY_Tokenizer weights into a local cache (HF Spaces reuse the cache)
os.makedirs("XY_Tokenizer/weights", exist_ok=True)
try:
    SPT_CHECKPOINT_PATH = hf_hub_download(
        repo_id="fnlp/XY_Tokenizer_TTSD_V0",
        filename="xy_tokenizer.ckpt",
        cache_dir="XY_Tokenizer/weights",
    )
except Exception as e:  # noqa: BLE001
    # On failure keep the placeholder path; initialization will report the problem again
    print(f"⚠️ XY_Tokenizer 权重下载失败: {e}")
    SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"

# Global cache
tokenizer = None
model = None
spt = None
device = None


# =========================
# Utility functions
# =========================
def get_scenario_examples():
    """Collect all available scenario examples, merging the JSON files with the default config."""
    scenarios = {}

    # Load scenarios from JSON files
    for key, config in SCENARIO_CONFIG.items():
        try:
            file_path = config["file"]
            print(f"🔍 检查场景文件: {file_path}")
            if os.path.exists(file_path):
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                scenarios[config["title"]] = {
                    "text": data.get("text", ""),
                    "description": config["description"],
                    "audio1": data.get("prompt_audio_speaker1", ""),
                    "text1": data.get("prompt_text_speaker1", ""),
                    "audio2": data.get("prompt_audio_speaker2", ""),
                    "text2": data.get("prompt_text_speaker2", ""),
                    "base_path": data.get("base_path", ""),
                }
                print(f"✅ 成功加载场景: {config['title']}")
            else:
                print(f"❌ 场景文件不存在: {file_path}")
        except Exception as e:
            print(f"⚠️ 加载场景 {key} 失败: {e}")

    # Add the default example (so at least one scenario is always available)
    scenarios["🎧 默认示例"] = {
        "text": (
            "[S1]大家好,欢迎收听今天的节目,我是主播小雨。"
            "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。"
            "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。"
            "[S2]是的,让我们开始今天的精彩内容吧!"
        ),
        "description": "默认的示例对话,适合快速体验",
        "audio1": DEFAULT_AUDIO_CONFIG["speaker1"]["audio"],
        "text1": DEFAULT_AUDIO_CONFIG["speaker1"]["text"],
        "audio2": DEFAULT_AUDIO_CONFIG["speaker2"]["audio"],
        "text2": DEFAULT_AUDIO_CONFIG["speaker2"]["text"],
        "base_path": "",
    }

    print(f"📊 总共加载了 {len(scenarios)} 个场景")
    return scenarios
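
# Illustrative only: a sketch of what each scenario .jsonl file listed in
# SCENARIO_CONFIG is expected to contain, inferred from the keys read by
# get_scenario_examples() above and load_scenario_data() below. The files in
# scenarios/ are the source of truth; all field values here are placeholders.
#
#   {
#     "base_path": "examples",
#     "text": "[S1]Speaker 1 line...[S2]Speaker 2 line...",
#     "prompt_audio_speaker1": "zh_spk1_moon.wav",
#     "prompt_text_speaker1": "Transcript of speaker 1's reference audio",
#     "prompt_audio_speaker2": "zh_spk2_moon.wav",
#     "prompt_text_speaker2": "Transcript of speaker 2's reference audio"
#   }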
), "description": "默认的示例对话,适合快速体验", "audio1": DEFAULT_AUDIO_CONFIG["speaker1"]["audio"], "text1": DEFAULT_AUDIO_CONFIG["speaker1"]["text"], "audio2": DEFAULT_AUDIO_CONFIG["speaker2"]["audio"], "text2": DEFAULT_AUDIO_CONFIG["speaker2"]["text"], "base_path": "", } print(f"📊 总共加载了 {len(scenarios)} 个场景") return scenarios def load_scenario_data(scenario_key: str): """加载场景数据,确保音频和文本一一对应""" if scenario_key not in SCENARIO_CONFIG: return None, None, None, None, None try: scenario_file = SCENARIO_CONFIG[scenario_key]["file"] if not os.path.exists(scenario_file): return None, None, None, None, None with open(scenario_file, "r", encoding="utf-8") as f: data = json.load(f) # 确保音频文件路径正确 audio1_path = data.get("prompt_audio_speaker1", "") audio2_path = data.get("prompt_audio_speaker2", "") if audio1_path and not audio1_path.startswith("/"): audio1_path = os.path.join(data.get("base_path", ""), audio1_path) if audio2_path and not audio2_path.startswith("/"): audio2_path = os.path.join(data.get("base_path", ""), audio2_path) return ( data.get("text", ""), audio1_path if os.path.exists(audio1_path) else None, data.get("prompt_text_speaker1", ""), audio2_path if os.path.exists(audio2_path) else None, data.get("prompt_text_speaker2", "") ) except Exception as e: print(f"❌ 加载场景失败: {e}") return None, None, None, None, None def load_default_audio(): """加载默认音频和文本,确保音频文件存在""" audio1 = DEFAULT_AUDIO_CONFIG["speaker1"]["audio"] text1 = DEFAULT_AUDIO_CONFIG["speaker1"]["text"] audio2 = DEFAULT_AUDIO_CONFIG["speaker2"]["audio"] text2 = DEFAULT_AUDIO_CONFIG["speaker2"]["text"] # 默认对话文本 default_text = ( "[S1]大家好,欢迎收听今天的节目,我是主播小雨。" "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。" "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。" "[S2]是的,让我们开始今天的精彩内容吧!" ) # 检查音频文件是否存在 audio1_exists = os.path.exists(audio1) audio2_exists = os.path.exists(audio2) print(f"🔍 默认音频检查: {audio1}={audio1_exists}, {audio2}={audio2_exists}") # 如果文件存在,返回绝对路径;否则返回None audio1_path = os.path.abspath(audio1) if audio1_exists else None audio2_path = os.path.abspath(audio2) if audio2_exists else None print(f"🎵 返回音频路径: audio1={audio1_path}, audio2={audio2_path}") return ( default_text, audio1_path, text1, audio2_path, text2 ) def initialize_model(): global tokenizer, model, spt, device if tokenizer is not None: return tokenizer, model, spt, device device = "cuda" if torch.cuda.is_available() else "cpu" print(f"🔧 初始化模型,设备: {device}") if not os.path.exists(SPT_CHECKPOINT_PATH): raise FileNotFoundError( "未找到 XY_Tokenizer 权重,请检查网络或手动放置到 XY_Tokenizer/weights/xy_tokenizer.ckpt" ) tokenizer, model, spt = load_model( MODEL_PATH, SPT_CONFIG_PATH, SPT_CHECKPOINT_PATH, ) model = model.to(device) spt = spt.to(device) # 设置稳定的生成参数,避免数值不稳定 try: # 优化生成长度,平衡质量与速度 model.generation_config.max_new_tokens = min( getattr(model.generation_config, "max_new_tokens", 1024), 1024 # 减少默认长度,提升速度 ) # 使用文档推荐的"轻松对话风格"参数组合,确保数值稳定 model.generation_config.do_sample = True model.generation_config.temperature = 1.0 # 恢复默认值,避免数值不稳定 model.generation_config.top_k = 50 # 添加top_k限制 model.generation_config.top_p = 0.9 # 保持合理的nucleus采样 model.generation_config.repetition_penalty = 1.1 # 避免重复 model.generation_config.num_beams = 1 # 使用贪心搜索 # 添加数值稳定性保护 model.generation_config.epsilon = 1e-8 # 防止除零错误 model.generation_config.pad_token_id = model.config.eos_token_id print(f"🚀 应用稳定生成参数: temp={model.generation_config.temperature}, top_k={model.generation_config.top_k}, top_p={model.generation_config.top_p}") except Exception as e: # noqa: BLE001 print(f"⚠️ 生成参数设置失败: {e}") pass print("✅ 模型初始化完成!") return tokenizer, model, spt, device # 


# =========================
# Inference function (called by the UI)
# =========================
@spaces.GPU(duration=60)  # shorter GPU lease for faster scheduling
def generate_dialogue_audio(
    dialogue_text: str,
    speaker1_audio: Optional[str],
    speaker1_text: str,
    speaker2_audio: Optional[str],
    speaker2_text: str,
    use_normalize: bool,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    max_new_tokens: int = 2048,
    do_sample: bool = True,
) -> Tuple[Optional[str], str]:
    try:
        if not dialogue_text or not dialogue_text.strip():
            return None, "❌ 请输入对话文本"

        # Check the audio inputs
        if not speaker1_audio and not speaker2_audio:
            return None, "💡 页面应该已自动加载默认音频,如未加载请点击 '🎧 默认音频' 按钮,或上传您自己的参考音频文件!"

        # Initialize the model (loads weights on the first call)
        tokenizer, model, spt, device = initialize_model()

        # Apply the user-selected generation parameters, overriding the defaults
        print(f"🎛️ 应用用户参数: temp={temperature}, top_k={top_k}, top_p={top_p}, penalty={repetition_penalty}")
        model.generation_config.temperature = temperature
        model.generation_config.top_k = top_k
        model.generation_config.top_p = top_p
        model.generation_config.repetition_penalty = repetition_penalty
        model.generation_config.max_new_tokens = min(max_new_tokens, 4096)  # safety cap
        model.generation_config.do_sample = do_sample

        # Assemble the item from the inputs (process_batch supports single- and dual-speaker prompts)
        item = {"text": dialogue_text}
        if speaker1_audio and speaker2_audio:
            item.update(
                {
                    "prompt_audio_speaker1": speaker1_audio,
                    "prompt_text_speaker1": speaker1_text or "",
                    "prompt_audio_speaker2": speaker2_audio,
                    "prompt_text_speaker2": speaker2_text or "",
                }
            )
        else:
            # Single-prompt mode
            single_audio = speaker1_audio or speaker2_audio
            single_text = speaker1_text or speaker2_text or ""
            item.update({"prompt_audio": single_audio, "prompt_text": single_text})

        # Run synthesis, retrying deterministically on numerical instability
        try:
            actual_texts_data, audio_results = process_batch(
                batch_items=[item],
                tokenizer=tokenizer,
                model=model,
                spt=spt,
                device=device,
                system_prompt=SYSTEM_PROMPT,
                start_idx=0,
                use_normalize=use_normalize,
            )
        except RuntimeError as e:
            if "probability tensor contains" in str(e):
                print("⚠️ 检测到数值不稳定,尝试使用确定性生成...")
                # Temporarily switch to deterministic generation
                original_do_sample = model.generation_config.do_sample
                model.generation_config.do_sample = False
                try:
                    actual_texts_data, audio_results = process_batch(
                        batch_items=[item],
                        tokenizer=tokenizer,
                        model=model,
                        spt=spt,
                        device=device,
                        system_prompt=SYSTEM_PROMPT,
                        start_idx=0,
                        use_normalize=use_normalize,
                    )
                finally:
                    # Restore the original setting
                    model.generation_config.do_sample = original_do_sample
            else:
                raise

        if not audio_results or audio_results[0] is None:
            return None, "❌ 音频生成失败"

        audio_result = audio_results[0]
        out_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        torchaudio.save(out_path, audio_result["audio_data"], audio_result["sample_rate"])  # type: ignore[index]

        status = (
            f"✅ 生成成功!\n\n"
            f"📊 音频信息:\n"
            f"- 采样率: {audio_result['sample_rate']} Hz\n"
            f"- 时长: {audio_result['audio_data'].shape[-1] / audio_result['sample_rate']:.2f} 秒\n"
            f"- 通道数: {audio_result['audio_data'].shape[0]}\n\n"
            f"📝 文本处理:\n"
            f"- 是否规范化: {use_normalize}\n"
        )
        return out_path, status

    except Exception as e:  # noqa: BLE001
        import traceback

        return None, f"❌ 生成出错: {e}\n\n{traceback.format_exc()}"
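
# Illustrative local smoke test (not wired into the Space UI): a sketch showing
# how the synthesis function above can be called directly with the bundled
# default prompts. It assumes the example audio files referenced by
# DEFAULT_AUDIO_CONFIG are present in the working directory.
#
#   text, a1, t1, a2, t2 = load_default_audio()
#   wav_path, status = generate_dialogue_audio(
#       dialogue_text=text,
#       speaker1_audio=a1, speaker1_text=t1,
#       speaker2_audio=a2, speaker2_text=t2,
#       use_normalize=True,
#   )
#   print(status, wav_path)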


# =========================
# UI construction
# =========================
def create_space_ui() -> gr.Blocks:
    # Apple-inspired dark theme design
    custom_css = """
    /* Global styles - Apple-style dark theme */
    .gradio-container {
        max-width: 1400px !important;
        margin: 0 auto !important;
        font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', system-ui, sans-serif !important;
        background: #0d1117 !important;
        --primary-color: #0969da;
        --primary-hover: #1f6feb;
        --surface-color: #161b22;
        --surface-secondary: #21262d;
        --border-color: #30363d;
        --border-secondary: #21262d;
        --text-primary: #f0f6fc;
        --text-secondary: #8b949e;
        --text-muted: #656d76;
        --success-color: #238636;
        --shadow: 0 4px 16px rgba(0,0,0,0.4);
        --shadow-elevated: 0 8px 32px rgba(0,0,0,0.6);
        --radius: 12px;
    }

    /* Header area */
    .header {
        text-align: center;
        margin-bottom: 2rem;
        background: var(--surface-color);
        padding: 3rem 2rem;
        border-radius: var(--radius);
        color: var(--text-primary);
        box-shadow: var(--shadow);
        border: 1px solid var(--border-color);
    }
    .header h1 {
        font-size: 2.75rem;
        margin: 0 0 0.5rem 0;
        font-weight: 700;
        letter-spacing: -0.02em;
        color: var(--text-primary);
    }
    .header p {
        font-size: 1.1rem;
        margin: 0;
        color: var(--text-secondary);
        font-weight: 400;
    }

    /* Card component - dark theme */
    .section {
        background: var(--surface-color);
        border-radius: var(--radius);
        padding: 2rem;
        border: 1px solid var(--border-color);
        margin: 1rem 0;
        box-shadow: var(--shadow);
        transition: all 0.2s ease;
    }
    .section:hover {
        box-shadow: var(--shadow-elevated);
        transform: translateY(-2px);
        border-color: var(--primary-color);
    }

    /* Button styles - dark theme */
    .quick-btn {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
        font-weight: 600 !important;
        border-radius: var(--radius) !important;
        padding: 0.875rem 2rem !important;
        transition: all 0.2s ease !important;
    }
    .quick-btn:hover {
        background: var(--primary-hover) !important;
        transform: translateY(-1px) !important;
        box-shadow: 0 8px 24px rgba(9,105,218,0.4) !important;
    }
    .generate-btn {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
        font-weight: 700 !important;
        font-size: 1.1rem !important;
        border-radius: var(--radius) !important;
        padding: 1rem 2rem !important;
        width: 100% !important;
        transition: all 0.2s ease !important;
        box-shadow: var(--shadow) !important;
    }
    .generate-btn:hover {
        background: var(--primary-hover) !important;
        transform: translateY(-2px) !important;
        box-shadow: var(--shadow-elevated) !important;
    }
    .speaker-section {
        background: var(--surface-secondary);
        padding: 1.5rem;
        border-radius: var(--radius);
        border: 1px solid var(--border-color);
    }

    /* Dark-theme overrides for Gradio components */
    .gradio-container .gr-textbox,
    .gradio-container .gr-textarea,
    .gradio-container .gr-dropdown,
    .gradio-container .gr-audio,
    .gradio-container .gr-slider,
    .gradio-container .gr-checkbox,
    .gradio-container .gr-accordion {
        background: var(--surface-color) !important;
        border: 1px solid var(--border-color) !important;
        color: var(--text-primary) !important;
        border-radius: var(--radius) !important;
    }
    .gradio-container .gr-textbox:focus,
    .gradio-container .gr-textarea:focus,
    .gradio-container .gr-dropdown:focus {
        border-color: var(--primary-color) !important;
        box-shadow: 0 0 0 3px rgba(9,105,218,0.2) !important;
    }

    /* Dark theme for text and labels */
    .gradio-container .gr-markdown,
    .gradio-container .gr-markdown *,
    .gradio-container label,
    .gradio-container p,
    .gradio-container span {
        color: var(--text-primary) !important;
    }
    .gradio-container .gr-markdown code {
        background: var(--surface-secondary) !important;
        color: var(--text-primary) !important;
        border-radius: 4px !important;
        padding: 2px 6px !important;
    }

    /* Unified dark theme for buttons */
    .gradio-container .gr-button {
        background: var(--surface-color) !important;
        border: 1px solid var(--border-color) !important;
        color: var(--text-primary) !important;
        border-radius: var(--radius) !important;
    }
    .gradio-container .gr-button:hover {
        background: var(--surface-secondary) !important;
        border-color: var(--primary-color) !important;
    }
    .gradio-container .gr-button.primary {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
    }
    .gradio-container .gr-button.primary:hover {
        background: var(--primary-hover) !important;
    }
    """

    with gr.Blocks(css=custom_css, title="🎙️ MOSS-TTSD | Hugging Face Space", theme="dark") as demo:
        gr.HTML(
            """
零样本双说话者对话合成 · 默认中文界面 · 一键加载场景