|
|
|
""" |
|
Hugging Face Space 首页 - MOSS-TTSD |
|
参考 fnlp/MOSS-TTSD Space 的实现,并结合本仓 UI 与文档做了增强: |
|
- 默认中文界面,保留简洁工作流 |
|
- 提供场景选择与一键加载 |
|
- 支持文本规范化选项 |
|
- 右侧提供简明的使用说明与文档链接 |
|
|
|
如需在本地运行本 Space 脚本: |
|
python hf_space/app.py |
|
""" |
|
import os |
|
import json |
|
import time |
|
import shutil |
|
import tempfile |
|
from typing import Optional, Tuple |
|
|
|
import gradio as gr |
|
import torch |
|
import torchaudio |
|
|
|
|
|
# `spaces` is optional: when it cannot be imported, substitute an object whose
# `GPU(...)` call returns a decorator that hands the function back unchanged.
try:
    import spaces
except Exception:

    class _DummySpaces:
        """Stand-in for the `spaces` module exposing a pass-through `GPU`."""

        def GPU(self, *args, **kwargs):
            """Accept any arguments and return an identity decorator."""
            return lambda fn: fn

    spaces = _DummySpaces()
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
from generation_utils import load_model, process_batch |
|
|
|
|
|
|
|
|
|
|
|
# System prompt passed to `process_batch` for every generation request.
SYSTEM_PROMPT = (
    "You are a speech synthesizer that generates natural, realistic, "
    "and human-like conversational audio from dialogue text."
)
|
|
|
|
|
# Preset scenarios keyed by scenario id. Each entry carries the display title,
# a short description and the scenario file, which is always
# `scenarios/<scenario id>.jsonl`.
SCENARIO_CONFIG = {
    scenario_id: {
        "title": title,
        "description": description,
        "file": f"scenarios/{scenario_id}.jsonl",
    }
    for scenario_id, title, description in (
        ("科技播客_AI发展", "🤖 科技播客 - AI发展趋势", "探讨人工智能的最新发展与未来趋势"),
        ("教育播客_学习方法", "📚 教育播客 - 高效学习方法", "分享科学的学习方法与技巧"),
        ("生活播客_美食文化", "🍜 生活播客 - 美食文化探索", "品味各地美食文化的魅力"),
        ("商业播客_创业经验", "💼 商业播客 - 创业经验分享", "创业路上的经验教训与心得"),
        ("健康播客_运动健身", "🏃 健康播客 - 运动健身指南", "科学健身与健康生活方式"),
        ("心理播客_情绪管理", "🧠 心理播客 - 情绪管理技巧", "探索情绪管理与心理健康"),
    )
}
|
|
|
|
|
# Default reference audio and its transcript for each of the two speakers.
DEFAULT_AUDIO_CONFIG = dict(
    speaker1=dict(
        audio="examples/zh_spk1_moon.wav",
        text="周一到周五,每天早晨七点半到九点半的直播片段。言下之意呢,就是废话有点多,大家也别嫌弃,因为这都是直播间最真实的状态了。",
    ),
    speaker2=dict(
        audio="examples/zh_spk2_moon.wav",
        text="如果大家想听到更丰富更及时的直播内容,记得在周一到周五准时进入直播间,和大家一起畅聊新消费新科技新趋势。",
    ),
)

# Model identifier and tokenizer config path handed to `load_model`.
MODEL_PATH = "fnlp/MOSS-TTSD-v0.5"
SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_config.yaml"
|
|
|
|
|
# Resolve the XY_Tokenizer checkpoint at import time. Start from the path of a
# manually placed file and replace it with the downloaded file's path when the
# download succeeds; a failed download is reported but does not abort import.
os.makedirs("XY_Tokenizer/weights", exist_ok=True)
SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"
try:
    SPT_CHECKPOINT_PATH = hf_hub_download(
        repo_id="fnlp/XY_Tokenizer_TTSD_V0",
        filename="xy_tokenizer.ckpt",
        cache_dir="XY_Tokenizer/weights",
    )
except Exception as download_error:
    print(f"⚠️ XY_Tokenizer 权重下载失败: {download_error}")
|
|
|
|
|
# Lazily populated by `initialize_model()`; all four stay None until then.
tokenizer = model = spt = device = None
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_scenario_examples():
    """Collect every loadable preset scenario plus the built-in default example.

    Each entry of ``SCENARIO_CONFIG`` whose file exists and parses is added
    under its display title. Missing or unreadable files are reported on
    stdout and skipped. The default example is always appended last.

    Scenario files may hold a single JSON document or be in JSON Lines form;
    for JSON Lines only the first non-empty record is used.

    Returns:
        dict mapping a display title to a dict with the keys ``text``,
        ``description``, ``audio1``, ``text1``, ``audio2``, ``text2`` and
        ``base_path``.
    """
    scenarios = {}

    for key, config in SCENARIO_CONFIG.items():
        try:
            file_path = config["file"]
            print(f"🔍 检查场景文件: {file_path}")
            if not os.path.exists(file_path):
                print(f"❌ 场景文件不存在: {file_path}")
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                raw = f.read()
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                # A multi-record .jsonl file is not one JSON document; fall
                # back to its first non-empty line. An empty file still raises
                # here and is reported by the outer handler.
                first_record = next(
                    (line for line in raw.splitlines() if line.strip()), ""
                )
                data = json.loads(first_record)

            scenarios[config["title"]] = {
                "text": data.get("text", ""),
                "description": config["description"],
                "audio1": data.get("prompt_audio_speaker1", ""),
                "text1": data.get("prompt_text_speaker1", ""),
                "audio2": data.get("prompt_audio_speaker2", ""),
                "text2": data.get("prompt_text_speaker2", ""),
                "base_path": data.get("base_path", ""),
            }
            print(f"✅ 成功加载场景: {config['title']}")
        except Exception as e:
            # Best effort: one broken scenario must not hide the others.
            print(f"⚠️ 加载场景 {key} 失败: {e}")

    # The built-in example does not depend on any scenario file.
    scenarios["🎧 默认示例"] = {
        "text": (
            "[S1]大家好,欢迎收听今天的节目,我是主播小雨。"
            "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。"
            "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。"
            "[S2]是的,让我们开始今天的精彩内容吧!"
        ),
        "description": "默认的示例对话,适合快速体验",
        "audio1": DEFAULT_AUDIO_CONFIG["speaker1"]["audio"],
        "text1": DEFAULT_AUDIO_CONFIG["speaker1"]["text"],
        "audio2": DEFAULT_AUDIO_CONFIG["speaker2"]["audio"],
        "text2": DEFAULT_AUDIO_CONFIG["speaker2"]["text"],
        "base_path": "",
    }

    print(f"📊 总共加载了 {len(scenarios)} 个场景")
    return scenarios
|
|
|
|
|
def load_scenario_data(scenario_key: str):
    """Load one preset scenario and resolve its reference-audio paths.

    Args:
        scenario_key: A key of ``SCENARIO_CONFIG``.

    Returns:
        A 5-tuple ``(dialogue_text, audio1_path, audio1_text, audio2_path,
        audio2_text)``. An audio path is ``None`` when it is unset or the file
        does not exist. All five values are ``None`` when the key is unknown,
        the scenario file is missing, or loading fails for any reason.
    """
    nothing = (None, None, None, None, None)
    if scenario_key not in SCENARIO_CONFIG:
        return nothing

    try:
        scenario_file = SCENARIO_CONFIG[scenario_key]["file"]
        if not os.path.exists(scenario_file):
            return nothing

        with open(scenario_file, "r", encoding="utf-8") as f:
            raw = f.read()
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # A multi-record .jsonl file is not one JSON document; use its
            # first non-empty line instead.
            first_record = next(
                (line for line in raw.splitlines() if line.strip()), ""
            )
            data = json.loads(first_record)

        base_path = data.get("base_path", "")

        def resolve(path):
            """Join a relative audio path onto base_path; None if unusable."""
            if not path:
                return None
            # Keep treating a leading "/" as absolute (the original check) and
            # additionally recognise platform-specific absolute paths.
            if not (path.startswith("/") or os.path.isabs(path)):
                path = os.path.join(base_path, path)
            return path if os.path.exists(path) else None

        return (
            data.get("text", ""),
            resolve(data.get("prompt_audio_speaker1", "")),
            data.get("prompt_text_speaker1", ""),
            resolve(data.get("prompt_audio_speaker2", "")),
            data.get("prompt_text_speaker2", ""),
        )
    except Exception as e:
        print(f"❌ 加载场景失败: {e}")
        return nothing
|
|
|
|
|
def load_default_audio():
    """Return the default dialogue text with both default speaker prompts.

    Returns:
        A 5-tuple ``(dialogue_text, audio1_path, text1, audio2_path, text2)``.
        Each audio path is absolute, or ``None`` when the configured file does
        not exist on disk.
    """
    first = DEFAULT_AUDIO_CONFIG["speaker1"]
    second = DEFAULT_AUDIO_CONFIG["speaker2"]
    audio1, text1 = first["audio"], first["text"]
    audio2, text2 = second["audio"], second["text"]

    default_text = "".join(
        [
            "[S1]大家好,欢迎收听今天的节目,我是主播小雨。",
            "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。",
            "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。",
            "[S2]是的,让我们开始今天的精彩内容吧!",
        ]
    )

    audio1_exists = os.path.exists(audio1)
    audio2_exists = os.path.exists(audio2)
    print(f"🔍 默认音频检查: {audio1}={audio1_exists}, {audio2}={audio2_exists}")

    # Only hand back a path when the file is really there.
    audio1_path = None
    if audio1_exists:
        audio1_path = os.path.abspath(audio1)
    audio2_path = None
    if audio2_exists:
        audio2_path = os.path.abspath(audio2)
    print(f"🎵 返回音频路径: audio1={audio1_path}, audio2={audio2_path}")

    return default_text, audio1_path, text1, audio2_path, text2
|
|
|
|
|
def initialize_model():
    """Load the tokenizer, model and speech tokenizer once and cache them.

    The first successful call stores the objects in the module globals
    ``tokenizer``, ``model``, ``spt`` and ``device``; later calls return the
    cached values. The globals are published only after loading and the device
    move have both succeeded, so a failed attempt is retried on the next call
    instead of leaving a half-initialised cache behind.

    Returns:
        Tuple ``(tokenizer, model, spt, device)`` where ``device`` is
        ``"cuda"`` or ``"cpu"``.

    Raises:
        FileNotFoundError: If the XY_Tokenizer checkpoint file is missing.
    """
    global tokenizer, model, spt, device
    if tokenizer is not None:
        return tokenizer, model, spt, device

    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 初始化模型,设备: {target_device}")

    if not os.path.exists(SPT_CHECKPOINT_PATH):
        raise FileNotFoundError(
            "未找到 XY_Tokenizer 权重,请检查网络或手动放置到 XY_Tokenizer/weights/xy_tokenizer.ckpt"
        )

    loaded_tokenizer, loaded_model, loaded_spt = load_model(
        MODEL_PATH,
        SPT_CONFIG_PATH,
        SPT_CHECKPOINT_PATH,
    )
    loaded_model = loaded_model.to(target_device)
    loaded_spt = loaded_spt.to(target_device)

    # Best effort: a model without the expected config attributes is still
    # usable, so a failure here is reported rather than raised.
    try:
        gen_config = loaded_model.generation_config

        # The attribute can be present but None; min(None, 1024) would raise
        # and skip every setting below.
        current_limit = getattr(gen_config, "max_new_tokens", None)
        gen_config.max_new_tokens = (
            1024 if current_limit is None else min(current_limit, 1024)
        )

        gen_config.do_sample = True
        gen_config.temperature = 1.0
        gen_config.top_k = 50
        gen_config.top_p = 0.9
        gen_config.repetition_penalty = 1.1
        gen_config.num_beams = 1

        gen_config.epsilon = 1e-8
        gen_config.pad_token_id = loaded_model.config.eos_token_id

        print(f"🚀 应用稳定生成参数: temp={gen_config.temperature}, top_k={gen_config.top_k}, top_p={gen_config.top_p}")
    except Exception as e:
        print(f"⚠️ 生成参数设置失败: {e}")

    # Publish only now that everything above succeeded.
    tokenizer, model, spt, device = (
        loaded_tokenizer,
        loaded_model,
        loaded_spt,
        target_device,
    )
    print("✅ 模型初始化完成!")
    return tokenizer, model, spt, device
|
|
|
|
|
|
|
|
|
|
|
|
|
@spaces.GPU(duration=60)
def generate_dialogue_audio(
    dialogue_text: str,
    speaker1_audio: Optional[str],
    speaker1_text: str,
    speaker2_audio: Optional[str],
    speaker2_text: str,
    use_normalize: bool,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    max_new_tokens: int = 2048,
    do_sample: bool = True,
) -> Tuple[Optional[str], str]:
    """Synthesize dialogue audio and save it to a temporary WAV file.

    Args:
        dialogue_text: Dialogue script to synthesize.
        speaker1_audio: File path of speaker 1's reference audio, or None.
        speaker1_text: Transcript of speaker 1's reference audio.
        speaker2_audio: File path of speaker 2's reference audio, or None.
        speaker2_text: Transcript of speaker 2's reference audio.
        use_normalize: Forwarded to ``process_batch`` as ``use_normalize``.
        temperature: Written to the model's generation config.
        top_k: Written to the model's generation config.
        top_p: Written to the model's generation config.
        repetition_penalty: Written to the model's generation config.
        max_new_tokens: Written to the generation config, capped at 4096.
        do_sample: Written to the model's generation config.

    Returns:
        ``(wav_path, status_message)``. ``wav_path`` is None when validation
        or generation fails; the message then describes the problem. No
        exception escapes this function.
    """
    try:
        if not dialogue_text or not dialogue_text.strip():
            return None, "❌ 请输入对话文本"

        if not speaker1_audio and not speaker2_audio:
            return None, "💡 页面应该已自动加载默认音频,如未加载请点击 '🎧 默认音频' 按钮,或上传您自己的参考音频文件!"

        tokenizer, model, spt, device = initialize_model()

        print(f"🎛️ 应用用户参数: temp={temperature}, top_k={top_k}, top_p={top_p}, penalty={repetition_penalty}")
        gen_config = model.generation_config
        gen_config.temperature = temperature
        gen_config.top_k = top_k
        gen_config.top_p = top_p
        gen_config.repetition_penalty = repetition_penalty
        gen_config.max_new_tokens = min(max_new_tokens, 4096)
        gen_config.do_sample = do_sample

        item = {"text": dialogue_text}
        if speaker1_audio and speaker2_audio:
            item.update(
                {
                    "prompt_audio_speaker1": speaker1_audio,
                    "prompt_text_speaker1": speaker1_text or "",
                    "prompt_audio_speaker2": speaker2_audio,
                    "prompt_text_speaker2": speaker2_text or "",
                }
            )
        else:
            # Single reference: the transcript must come from the same speaker
            # as the audio. Choosing them independently could pair speaker 2's
            # audio with speaker 1's text.
            if speaker1_audio:
                single_audio, single_text = speaker1_audio, speaker1_text
            else:
                single_audio, single_text = speaker2_audio, speaker2_text
            item.update({"prompt_audio": single_audio, "prompt_text": single_text or ""})

        def run_batch():
            """Run one generation pass with the current generation config."""
            return process_batch(
                batch_items=[item],
                tokenizer=tokenizer,
                model=model,
                spt=spt,
                device=device,
                system_prompt=SYSTEM_PROMPT,
                start_idx=0,
                use_normalize=use_normalize,
            )

        try:
            actual_texts_data, audio_results = run_batch()
        except RuntimeError as e:
            if "probability tensor contains" not in str(e):
                raise
            # Sampling failed on an invalid probability tensor: retry once
            # without sampling, then restore the requested mode.
            print("⚠️ 检测到数值不稳定,尝试使用确定性生成...")
            original_do_sample = gen_config.do_sample
            gen_config.do_sample = False
            try:
                actual_texts_data, audio_results = run_batch()
            finally:
                gen_config.do_sample = original_do_sample

        if not audio_results or audio_results[0] is None:
            return None, "❌ 音频生成失败"

        audio_result = audio_results[0]
        # Close the handle right away; only the reserved file name is needed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            out_path = tmp_file.name
        torchaudio.save(out_path, audio_result["audio_data"], audio_result["sample_rate"])

        status = (
            f"✅ 生成成功!\n\n"
            f"📊 音频信息:\n"
            f"- 采样率: {audio_result['sample_rate']} Hz\n"
            f"- 时长: {audio_result['audio_data'].shape[-1] / audio_result['sample_rate']:.2f} 秒\n"
            f"- 通道数: {audio_result['audio_data'].shape[0]}\n\n"
            f"📝 文本处理:\n"
            f"- 是否规范化: {use_normalize}\n"
        )
        return out_path, status

    except Exception as e:
        import traceback

        return None, f"❌ 生成出错: {e}\n\n{traceback.format_exc()}"
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_space_ui() -> gr.Blocks:
    """Build the Gradio Blocks UI for the Space and wire up its events.

    The left column holds the dialogue text, scenario loader, the two speaker
    reference panels, the normalization checkbox, the advanced sampling
    controls and the generate button. The right column holds the generated
    audio, a status box and usage notes.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    custom_css = """
    /* 全局样式 - Apple暗色风格 */
    .gradio-container {
        max-width: 1400px !important;
        margin: 0 auto !important;
        font-family: -apple-system, BlinkMacSystemFont, 'SF Pro Display', system-ui, sans-serif !important;
        background: #0d1117 !important;
        --primary-color: #0969da;
        --primary-hover: #1f6feb;
        --surface-color: #161b22;
        --surface-secondary: #21262d;
        --border-color: #30363d;
        --border-secondary: #21262d;
        --text-primary: #f0f6fc;
        --text-secondary: #8b949e;
        --text-muted: #656d76;
        --success-color: #238636;
        --shadow: 0 4px 16px rgba(0,0,0,0.4);
        --shadow-elevated: 0 8px 32px rgba(0,0,0,0.6);
        --radius: 12px;
    }
    /* 主标题区域 */
    .header {
        text-align: center;
        margin-bottom: 2rem;
        background: var(--surface-color);
        padding: 3rem 2rem;
        border-radius: var(--radius);
        color: var(--text-primary);
        box-shadow: var(--shadow);
        border: 1px solid var(--border-color);
    }
    .header h1 {
        font-size: 2.75rem;
        margin: 0 0 0.5rem 0;
        font-weight: 700;
        letter-spacing: -0.02em;
        color: var(--text-primary);
    }
    .header p {
        font-size: 1.1rem;
        margin: 0;
        color: var(--text-secondary);
        font-weight: 400;
    }
    /* 卡片组件 - 暗色主题 */
    .section {
        background: var(--surface-color);
        border-radius: var(--radius);
        padding: 2rem;
        border: 1px solid var(--border-color);
        margin: 1rem 0;
        box-shadow: var(--shadow);
        transition: all 0.2s ease;
    }
    .section:hover {
        box-shadow: var(--shadow-elevated);
        transform: translateY(-2px);
        border-color: var(--primary-color);
    }
    /* 按钮样式 - 暗色主题 */
    .quick-btn {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
        font-weight: 600 !important;
        border-radius: var(--radius) !important;
        padding: 0.875rem 2rem !important;
        transition: all 0.2s ease !important;
    }
    .quick-btn:hover {
        background: var(--primary-hover) !important;
        transform: translateY(-1px) !important;
        box-shadow: 0 8px 24px rgba(9,105,218,0.4) !important;
    }
    .generate-btn {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
        font-weight: 700 !important;
        font-size: 1.1rem !important;
        border-radius: var(--radius) !important;
        padding: 1rem 2rem !important;
        width: 100% !important;
        transition: all 0.2s ease !important;
        box-shadow: var(--shadow) !important;
    }
    .generate-btn:hover {
        background: var(--primary-hover) !important;
        transform: translateY(-2px) !important;
        box-shadow: var(--shadow-elevated) !important;
    }
    .speaker-section {
        background: var(--surface-secondary);
        padding: 1.5rem;
        border-radius: var(--radius);
        border: 1px solid var(--border-color);
    }

    /* Gradio 组件暗色主题覆盖 */
    .gradio-container .gr-textbox,
    .gradio-container .gr-textarea,
    .gradio-container .gr-dropdown,
    .gradio-container .gr-audio,
    .gradio-container .gr-slider,
    .gradio-container .gr-checkbox,
    .gradio-container .gr-accordion {
        background: var(--surface-color) !important;
        border: 1px solid var(--border-color) !important;
        color: var(--text-primary) !important;
        border-radius: var(--radius) !important;
    }

    .gradio-container .gr-textbox:focus,
    .gradio-container .gr-textarea:focus,
    .gradio-container .gr-dropdown:focus {
        border-color: var(--primary-color) !important;
        box-shadow: 0 0 0 3px rgba(9,105,218,0.2) !important;
    }

    /* 文本和标签暗色主题 */
    .gradio-container .gr-markdown,
    .gradio-container .gr-markdown *,
    .gradio-container label,
    .gradio-container p,
    .gradio-container span {
        color: var(--text-primary) !important;
    }

    .gradio-container .gr-markdown code {
        background: var(--surface-secondary) !important;
        color: var(--text-primary) !important;
        border-radius: 4px !important;
        padding: 2px 6px !important;
    }

    /* 按钮统一暗色主题 */
    .gradio-container .gr-button {
        background: var(--surface-color) !important;
        border: 1px solid var(--border-color) !important;
        color: var(--text-primary) !important;
        border-radius: var(--radius) !important;
    }

    .gradio-container .gr-button:hover {
        background: var(--surface-secondary) !important;
        border-color: var(--primary-color) !important;
    }

    .gradio-container .gr-button.primary {
        background: var(--primary-color) !important;
        border: none !important;
        color: var(--text-primary) !important;
    }

    .gradio-container .gr-button.primary:hover {
        background: var(--primary-hover) !important;
    }
    """

    def build_speaker_inputs(speaker_no: int):
        """Create one speaker's reference audio and text inputs.

        Pre-fills both from DEFAULT_AUDIO_CONFIG (the audio only when its file
        exists). If pre-filling raises, falls back to empty inputs.
        """
        audio_kwargs = {"label": "参考音频", "type": "filepath"}
        text_kwargs = {
            "label": "参考文本",
            "lines": 2,
            "placeholder": "请输入与参考音频内容完全匹配的文本...",
        }
        try:
            defaults = DEFAULT_AUDIO_CONFIG[f"speaker{speaker_no}"]
            default_audio = defaults["audio"]
            default_prompt = defaults["text"]
            if os.path.exists(default_audio):
                audio_input = gr.Audio(value=default_audio, **audio_kwargs)
            else:
                audio_input = gr.Audio(**audio_kwargs)
            text_input = gr.TextArea(value=default_prompt, **text_kwargs)
        except Exception as e:
            print(f"⚠️ 无法预设说话者{speaker_no}默认内容: {e}")
            audio_input = gr.Audio(**audio_kwargs)
            text_input = gr.TextArea(**text_kwargs)
        return audio_input, text_input

    # NOTE(review): "dark" does not look like a built-in Gradio theme name;
    # confirm how the installed Gradio version resolves it.
    with gr.Blocks(css=custom_css, title="🎙️ MOSS-TTSD | Hugging Face Space", theme="dark") as demo:
        gr.HTML(
            """
            <div class="header">
            <h1>🎙️ MOSS-TTSD 对话语音合成</h1>
            <p>零样本双说话者对话合成 · 默认中文界面 · 一键加载场景</p>
            </div>
            """
        )

        with gr.Row():
            # Left column: inputs and controls.
            with gr.Column(scale=3):
                with gr.Group():
                    gr.Markdown("### 📝 对话文本")

                    default_text = (
                        "[S1]大家好,欢迎收听今天的节目,我是主播小雨。"
                        "[S2]大家好,我是嘉宾阿明,很高兴和大家见面。"
                        "[S1]今天我们要聊的话题非常有趣,相信大家会喜欢的。"
                        "[S2]是的,让我们开始今天的精彩内容吧!"
                    )

                    dialogue_text = gr.TextArea(
                        label="",
                        lines=6,
                        placeholder="请输入对话内容,使用[S1]/[S2]标记不同说话者...",
                        value=default_text,
                    )

                with gr.Group():
                    gr.Markdown("### 🚀 快速操作")

                    # Derive the choices from SCENARIO_CONFIG so the dropdown
                    # cannot drift from the configured titles.
                    predefined_scenarios = ["🎧 默认示例"] + [
                        entry["title"] for entry in SCENARIO_CONFIG.values()
                    ]

                    scenario_dropdown = gr.Dropdown(
                        choices=predefined_scenarios,
                        value=predefined_scenarios[0],
                        label="🎭 选择场景",
                        info="选择一个预设场景,获取不同主题的对话文本",
                    )
                    with gr.Row():
                        btn_load_scenario = gr.Button("📝 加载场景文本", variant="secondary")
                        btn_load_default = gr.Button("🎧 加载默认音频", variant="secondary")

                with gr.Row():
                    with gr.Group():
                        gr.Markdown("### 🎵 说话者1 (女声)")
                        speaker1_audio, speaker1_text = build_speaker_inputs(1)
                    with gr.Group():
                        gr.Markdown("### 🎵 说话者2 (男声)")
                        speaker2_audio, speaker2_text = build_speaker_inputs(2)

                with gr.Group():
                    gr.Markdown("### ⚙️ 基础设置")
                    with gr.Row():
                        use_normalize = gr.Checkbox(label="✅ 文本标准化(推荐)", value=True)

                with gr.Accordion("🎛️ 高级参数设置", open=False):
                    gr.Markdown("**🎯 生成风格控制** - 根据需要调整参数以获得不同的语音风格")

                    with gr.Row():
                        style_preset = gr.Dropdown(
                            label="🎨 预设风格",
                            choices=["轻松对话", "新闻播报", "娱乐节目", "教育讲解", "自定义"],
                            value="轻松对话",
                            interactive=True,
                        )

                    gr.Markdown("**⚙️ 自定义参数** - 微调生成效果")

                    with gr.Row():
                        with gr.Column():
                            temperature = gr.Slider(
                                minimum=0.5,
                                maximum=1.5,
                                value=1.0,
                                step=0.1,
                                label="🌡️ 语气温度",
                                info="控制语气自然度 (0.5=稳定, 1.0=自然, 1.5=活泼)",
                            )
                            top_k = gr.Slider(
                                minimum=20,
                                maximum=100,
                                value=50,
                                step=10,
                                label="🔝 词汇多样性",
                                info="控制词汇选择范围",
                            )
                        with gr.Column():
                            top_p = gr.Slider(
                                minimum=0.7,
                                maximum=1.0,
                                value=0.9,
                                step=0.05,
                                label="🎯 表达流畅度",
                                info="控制表达的连贯性",
                            )
                            repetition_penalty = gr.Slider(
                                minimum=1.0,
                                maximum=1.3,
                                value=1.1,
                                step=0.05,
                                label="🔄 重复避免",
                                info="避免重复表达的强度",
                            )

                    with gr.Row():
                        max_new_tokens = gr.Slider(
                            minimum=512,
                            maximum=2048,
                            value=1024,
                            step=128,
                            label="📏 最大生成长度",
                            info="控制生成音频的长度 (512=快速, 1024=平衡, 2048=完整)",
                        )
                        do_sample = gr.Checkbox(
                            label="🎲 启用采样",
                            value=True,
                            info="关闭后使用确定性生成,更稳定但缺乏变化",
                        )

                btn_generate = gr.Button("🎬 开始合成", variant="primary", size="lg")
                gr.Markdown("💡 **开箱即用**: 页面已预填充默认内容,可直接合成 | **生成优化**: 预计20-40秒完成")

            # Right column: results and usage notes.
            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("### 🎧 生成结果")
                    output_audio = gr.Audio(label="生成的音频", type="filepath")
                    status_info = gr.TextArea(label="状态信息", lines=12, interactive=False)

                with gr.Group():
                    gr.Markdown("### 📚 使用说明")
                    gr.Markdown(
                        """
                        **🎯 快速开始:**
                        1. 【文本】选择场景并点击"📝 加载场景文本",或自己输入对话文本
                        2. 【音频】点击"🎧 加载默认音频"使用示例音频,或上传自己的参考音频
                        3. 【参考文本】确保参考文本与音频内容完全匹配
                        4. 【设置】勾选"文本标准化",可选调整高级参数
                        5. 【生成】点击"🎬 开始合成"

                        **📝 格式要求:**
                        - 使用 `[S1]`/`[S2]` 标记不同说话者
                        - 参考文本需与参考音频内容完全匹配
                        - 支持上传两个参考音频(双说话者)或一个(单说话者)

                        **🎵 音频建议:**
                        - 格式: WAV, MP3, FLAC
                        - 时长: 10-30秒最佳
                        - 质量: 清晰无背景噪音
                        - 语速: 自然正常语速

                        **💡 提示:**
                        - 文本标准化开启可提升质量(数字、标点等处理更稳定)
                        - 文本尽量短句、自然口语化
                        - 生成时间根据文本长度而定,请耐心等待
                        """
                    )

        def on_load_scenario(name: str):
            """Replace the dialogue text with the chosen scenario's text.

            Only the dialogue text changes; the reference audio is untouched.
            """
            if not name or name.strip() == "":
                gr.Warning("⚠️ 请先选择一个场景")
                return gr.update()

            scenarios = get_scenario_examples()
            if name not in scenarios:
                # gr.Error is an exception: it only reaches the user when
                # raised, so merely constructing it showed nothing.
                raise gr.Error(f"❌ 场景不存在: {name}")

            try:
                scenario_text = scenarios[name].get("text", "")
            except Exception as e:
                raise gr.Error(f"❌ 加载场景时出错: {str(e)}") from e

            gr.Info(f"✅ 成功加载场景: {name} (仅更换对话文本)")
            return scenario_text

        def on_load_default():
            """Load the default dialogue text and both default speaker prompts."""
            try:
                result = load_default_audio()
            except Exception as e:
                # Raised so the message is actually shown to the user.
                raise gr.Error(f"❌ 加载默认音频时出错: {str(e)}") from e

            gr.Info("✅ 成功加载默认音频和文本")
            return result

        # Sampling parameters applied when a style preset is selected.
        STYLE_PRESETS = {
            "轻松对话": {"temperature": 1.0, "top_k": 50, "top_p": 0.9, "repetition_penalty": 1.1},
            "新闻播报": {"temperature": 0.8, "top_k": 30, "top_p": 0.85, "repetition_penalty": 1.05},
            "娱乐节目": {"temperature": 1.2, "top_k": 80, "top_p": 0.95, "repetition_penalty": 1.15},
            "教育讲解": {"temperature": 0.9, "top_k": 40, "top_p": 0.88, "repetition_penalty": 1.08},
            "自定义": {"temperature": 1.0, "top_k": 50, "top_p": 0.9, "repetition_penalty": 1.1},
        }

        def on_style_preset_change(preset_name):
            """Move the four sampling sliders to the selected preset's values."""
            if preset_name in STYLE_PRESETS:
                params = STYLE_PRESETS[preset_name]
                return (
                    gr.update(value=params["temperature"]),
                    gr.update(value=params["top_k"]),
                    gr.update(value=params["top_p"]),
                    gr.update(value=params["repetition_penalty"]),
                )
            # Unknown preset: leave every slider as it is.
            return gr.update(), gr.update(), gr.update(), gr.update()

        style_preset.change(
            fn=on_style_preset_change,
            inputs=[style_preset],
            outputs=[temperature, top_k, top_p, repetition_penalty],
        )

        btn_load_scenario.click(
            fn=on_load_scenario,
            inputs=[scenario_dropdown],
            outputs=[dialogue_text],
        )

        btn_load_default.click(
            fn=on_load_default,
            outputs=[dialogue_text, speaker1_audio, speaker1_text, speaker2_audio, speaker2_text],
        )

        btn_generate.click(
            fn=generate_dialogue_audio,
            inputs=[
                dialogue_text, speaker1_audio, speaker1_text, speaker2_audio, speaker2_text,
                use_normalize, temperature, top_k, top_p, repetition_penalty, max_new_tokens, do_sample,
            ],
            outputs=[output_audio, status_info],
            show_progress=True,
        )

    return demo
|
|
|
|
|
|
|
# The UI is built at import time; `main()` launches this module-level instance.
demo = create_space_ui()


def main():
    """Enable the request queue (max_size=16) and launch the app."""
    queued_app = demo.queue(max_size=16)
    queued_app.launch()


if __name__ == "__main__":
    main()