# Copyright (c) 2025 MediaTek Research Inc (authors: Chan-Jan Hsu)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spaces
import os
import sys

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import argparse
import logging
import random
import subprocess

import gradio as gr
import librosa
import numpy as np
import torch
import torchaudio
from scipy.signal import resample
from transformers import pipeline

logging.getLogger('matplotlib').setLevel(logging.WARNING)

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, speed_change

# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s %(levelname)s %(message)s')

max_val = 0.8


def generate_seed():
    """Return a Gradio update dict carrying a fresh random seed."""
    seed = random.randint(1, 100000000)
    return {
        "__type__": "update",
        "value": seed
    }


def set_all_random_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for reproducible inference."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, peak-normalize to max_val, and append 0.2 s of trailing silence."""
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
    return speech


@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav, seed):
    # For instruct mode, make sure the model is iic/CosyVoice-300M-Instruct
    # and the mode is not cross_lingual.
    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    speed_factor = 1  # fixed at 1 while the speed slider in the UI is disabled
    if speed_factor != 1.0:
        # Naive speed change: resample the waveform to 1/speed_factor of its length.
        audio_data = output['tts_speech'].numpy().flatten()
        new_length = int(len(audio_data) / speed_factor)
        audio_data = resample(audio_data, new_length)
    else:
        audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)


@spaces.GPU
def generate_text(prompt_wav):
    """Transcribe the prompt audio with Whisper so the user can proofread the transcript."""
    if prompt_wav:
        results = asr_pipeline(prompt_wav)
        return results['text']
    return "No valid input detected."


def main():
    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
        gr.Markdown(
            """# BreezyVoice 語音合成系統
#### Runs on Hugging Face ZeroGPU (H200)
為了加快推理速度,g2pw 注音標註並未被啟動。"""
        )

        # All content arranged in a single column
        with gr.Column():
            # Step 1: prompt audio input and its auto-generated transcript
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
            prompt_wav = gr.Audio(
                type='filepath',
                label='選擇 prompt 音訊檔案(確保取樣率不低於 16 kHz)或錄製 prompt 音訊'
            )
            with gr.Group():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
                    lines=2,
                    placeholder="音訊樣本文本"
                )
                # Auto-fill the transcript whenever a new prompt clip is uploaded or recorded.
                prompt_wav.input(
                    fn=generate_text,
                    inputs=[prompt_wav],
                    outputs=prompt_text
                )
            gr.Examples(
                examples=[
                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
                ],
                inputs=[prompt_wav, prompt_text],
                label="範例"
            )

            # Step 2: text to synthesize
            gr.Markdown("### 步驟 2. 合成文本輸入")
            tts_text = gr.Textbox(
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )

            # Step 3: synthesized audio output
            gr.Markdown("### 步驟 3. 合成音訊")
            with gr.Accordion("進階設定", open=False):
                seed = gr.Number(value=0, label="隨機推理種子")
                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")
                # The speed slider is disabled for now; generate_audio keeps speed_factor = 1.
                # speed_factor = gr.Slider(
                #     minimum=0.25, maximum=4, step=0.05,
                #     label="語速", value=1.0, interactive=True
                # )
            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")

            # Callbacks for seed generation and audio synthesis
            seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
            generate_button.click(
                fn=generate_audio,
                inputs=[tts_text, prompt_text, prompt_wav, seed],
                outputs=audio_output
            )

    demo.launch()


if __name__ == '__main__':
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        tokenizer="openai/whisper-tiny",
        device=0  # use GPU if available; set to -1 for CPU
    )
    sft_spk = cosyvoice.list_avaliable_spks()  # (sic) upstream CosyVoice method name
    prompt_sr, target_sr = 16000, 22050  # prompts load at 16 kHz; the model outputs 22.05 kHz
    default_data = np.zeros(target_sr)
    main()
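
# ----------------------------------------------------------------------------
# Headless usage sketch (not executed): a minimal example of driving the same
# zero-shot pipeline from a plain script, without the Gradio UI. It assumes
# the bundled examples/ clip shipped with this demo and that
# inference_zero_shot returns a {'tts_speech': Tensor} dict, exactly as
# generate_audio above relies on.
#
#     cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
#     prompt_speech_16k = load_wav('examples/commonvoice-example-1.mp3', 16000)
#     output = cosyvoice.inference_zero_shot(
#         '我今天忙了一整天,現在好想睡覺喔',  # text to synthesize
#         '明月幾時有,去問氣象局',            # transcript of the prompt clip
#         prompt_speech_16k
#     )
#     torchaudio.save('output.wav', output['tts_speech'], 22050)
# ----------------------------------------------------------------------------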