"""Sambert Chinese voice-cloning demo (Hugging Face Spaces-style script).

Pipeline exposed through a Gradio UI:
  1. ``auto_label`` — segment the user's recording with Whisper and
     auto-label it for TTS training.
  2. ``train``      — finetune a pretrained Sambert-HifiGAN model on the
     labeled data via ModelScope's KanTTS trainer.
  3. ``infer``      — synthesize arbitrary text with the finetuned voice.
"""
import subprocess  # noqa: F401  (kept from original; may be used by callers)
import random
import os
import csv  # noqa: F401  (kept from original)
import traceback
from pathlib import Path
from textwrap import dedent

import librosa
from scipy.io import wavfile
import numpy as np
import torch  # noqa: F401  (required by downstream modelscope/kantts code)
import whisper
import gradio as gr

# Runtime dependency installation, kept from the original Spaces setup.
# NOTE(review): installing packages at import time is fragile, but these
# installs must complete before the ``sox`` / modelscope imports below work.
os.system("pip install --upgrade Cython==0.29.35")
os.system("pip install pysptk --no-build-isolation")
os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install tts-autolabel -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")

import sox  # noqa: F401  (installed above; imported for downstream use)


def split_long_audio(model, filepaths, save_dir="data_dir", out_sr=44100):
    """Split each input audio file into sentence-level WAV segments.

    Uses the given Whisper ``model`` to obtain segment timestamps, then
    trims leading/trailing silence, peak-normalizes, resamples to
    ``out_sr`` and writes one 16-bit PCM WAV per segment into
    ``save_dir`` (named ``{file_idx}_{segment_idx}.wav``).

    :param model: a loaded Whisper model (must support ``transcribe``).
    :param filepaths: a single path or a list of paths to audio files.
    :param save_dir: output directory; created if missing.
    :param out_sr: target sample rate for the written segments.
    """
    if isinstance(filepaths, str):
        filepaths = [filepaths]

    save_path = Path(save_dir)
    save_path.mkdir(exist_ok=True, parents=True)

    for file_idx, filepath in enumerate(filepaths):
        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True,
                                  task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        peak = np.abs(wav).max()
        if peak > 1.0:
            # Bring clipped audio just under full scale before resampling.
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        # Peak-normalize to [-1, 1]; guard against all-silent audio, which
        # would otherwise divide by zero (bug fix).
        peak2 = max(wav2.max(), -wav2.min())
        if peak2 > 0:
            wav2 /= peak2

        for i, seg in enumerate(segments):
            wav_seg = wav2[int(seg['start'] * out_sr):int(seg['end'] * out_sr)]
            out_fpath = save_path / f"{file_idx}_{i}.wav"
            wavfile.write(out_fpath, rate=out_sr,
                          data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))


# Whisper is loaded once at startup and shared by all labeling requests.
whisper_size = "medium"
whisper_model = whisper.load_model(whisper_size)

# These imports require the packages installed via os.system above.
from modelscope.tools import run_auto_label
from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.utils.audio.audio_utils import TtsTrainType

pretrained_model_id = 'damo/speech_personal_sambert-hifigan_nsf_tts_zh-cn_pretrain_16k'
dataset_id = "/home/user/app/output_training_data/"
pretrain_work_dir = "/home/user/app/pretrain_work_dir/"


def auto_label(Voicetoclone, VoiceMicrophone):
    """Step 1: segment and auto-label the user's audio for TTS training.

    The microphone recording takes precedence when both inputs are given.
    Returns a status string displayed in the UI.
    """
    audio = VoiceMicrophone if VoiceMicrophone is not None else Voicetoclone
    try:
        split_long_audio(whisper_model, audio, "/home/user/app/test_wavs/")
        input_wav = "/home/user/app/test_wavs/"
        output_data = "/home/user/app/output_training_data/"
        ret, report = run_auto_label(input_wav=input_wav,
                                     work_dir=output_data,
                                     resource_revision="v1.0.7")
    except Exception:
        # Bug fix: the original swallowed every error and still reported
        # success; log the traceback and surface the failure to the user.
        traceback.print_exc()
        return "标注失败,请重试"
    return "标注成功"


def train(a):
    """Step 2: finetune the pretrained Sambert model on the labeled data.

    :param a: the labeling-status textbox value wired in by Gradio; unused.
    :return: a status string displayed in the UI.
    """
    try:
        train_info = {
            TtsTrainType.TRAIN_TYPE_SAMBERT: {  # AM (sambert) training config
                'train_steps': 52,               # total training steps
                'save_interval_steps': 50,       # checkpoint every N steps
                'log_interval': 10               # log every N steps
            }
        }
        # Training arguments: dataset, temporary work dir and train_info.
        kwargs = dict(
            model=pretrained_model_id,      # model to finetune
            model_revision="v1.0.6",
            work_dir=pretrain_work_dir,     # temporary working directory
            train_dataset=dataset_id,       # labeled-dataset directory
            train_type=train_info           # training type and parameters
        )
        trainer = build_trainer(Trainers.speech_kantts_trainer,
                                default_args=kwargs)
        trainer.train()
    except Exception:
        # Bug fix: report failure instead of silently claiming success.
        traceback.print_exc()
        return "训练失败,请重试"
    return "训练完成"


def infer(text):
    """Step 3: synthesize ``text`` with the finetuned voice.

    Builds a SambertHifigan model from the checkpoints produced by
    ``train`` and returns the path of the written WAV file.
    """
    model_dir = "/home/user/app/pretrain_work_dir/"
    custom_infer_abs = {
        'voice_name': 'F7',
        'am_ckpt': os.path.join(model_dir, 'tmp_am', 'ckpt'),
        'am_config': os.path.join(model_dir, 'tmp_am', 'config.yaml'),
        'voc_ckpt': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
        'voc_config': os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'config.yaml'),
        'audio_config': os.path.join(model_dir, 'data', 'audio_config.yaml'),
        'se_file': os.path.join(model_dir, 'data', 'se', 'se.npy')
    }
    kwargs = {'custom_ckpt': custom_infer_abs}
    model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)
    inference = pipeline(task=Tasks.text_to_speech, model=model_id)
    output = inference(input=text)

    # Random file name so repeated syntheses don't clobber each other.
    # Bug fix: use 'wb' instead of exclusive 'bx' so an (unlikely) name
    # collision cannot crash the request.
    filename = str(random.randint(1, 1000000000000)) + "myfile.wav"
    with open(filename, mode='wb') as f:
        f.write(output["output_wav"])
    return filename


app = gr.Blocks()
with app:
    gr.Markdown("# 🥳🎶🎡 - Sambert中文声音克隆")
    gr.Markdown("## 🌟 - 训练3分钟,推理5秒钟,中英真实拟声")
    gr.Markdown("### 🌊 - 更多精彩应用,敬请关注[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕")
    with gr.Row():
        with gr.Column():
            inp1 = gr.Audio(type="filepath", source="upload",
                            label="方案一:请从本地上传一段语音")
            inp_micro = gr.Audio(type="filepath", source="microphone",
                                 label="方案二:请用麦克风录制您的声音")
        with gr.Column():
            out1 = gr.Textbox(label="标注情况", lines=1, interactive=False)
            out2 = gr.Textbox(label="训练情况", lines=1, interactive=False)
            inp2 = gr.Textbox(label="请在这里填写您想合成的文本",
                              placeholder="想说却还没说的 还很多...", lines=3)
        with gr.Column():
            out3 = gr.Audio(type="filepath", label="为您合成的专属音频")
    with gr.Row():
        btn1 = gr.Button("1.标注数据")
        btn2 = gr.Button("2.开始训练")
        btn3 = gr.Button("3.一键推理", variant="primary")
        btn1.click(auto_label, [inp1, inp_micro], out1)
        btn2.click(train, out1, out2)
        btn3.click(infer, inp2, out3)
    with gr.Accordion("📒 使用指南", open=True):
        _ = """
        如何使用此程序:
        * 使用方案一或方案二,上传一分钟左右的语音后,依次点击“1.标注数据”、“2.开始训练”、“3.一键推理”即可开启声音克隆之旅
        * 选择两个方案中的一个即可,程序会优先使用麦克风上传的语音;如果您需要从本地上传语音文件,请不要同时用方案二录制语音
        * 您可以随时编辑想要合成的文本内容,但请不要生成会对个人以及组织造成侵害的内容
        * 如果您需要用方案二录制您的声音,以下是一段长度合适的文本,供您朗读并录制:
        我看到鸟儿飞到天空,它们飞得多快呀。明天它们再飞过同样的路线,也永远不是今天了。或许明天飞过这条路线的,不是老鸟,而是小鸟了。时间过得飞快,使我小心眼里不只是着急,还有悲伤。有一天我放学回家,看到太阳快落山了,就下决心说:“我要比太阳更快地回家。”我狂奔回去,站在庭院里喘气的时候,看到太阳还露着半边脸,我高兴地跳起来。那一天我跑赢了太阳。以后我常做这样的游戏,有时和太阳赛跑,有时和西北风比赛,有时一个暑假的作业,我十天就做完了。那时我三年级,常把哥哥五年级的作业拿来做。后来的二十年里,我因此受益无穷。虽然我知道人永远跑不过时间,但是可以比原来快跑几步。那几步虽然很小很小,但作用却很大很大。如果将来我有什么要教给我的孩子,我会告诉他:假若你一直和时间赛跑,你就可以成功。
        """
        gr.Markdown(dedent(_))
    gr.Markdown("### 注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。")
    gr.HTML(''' ''')

app.launch(show_error=True)