# Copyright (c) 2025 MediaTek Research Inc (authors: Chan-Jan Hsu)
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import spaces
import os
import sys

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import argparse
import logging
import random
import subprocess

import gradio as gr
import librosa
import numpy as np
import torch
import torchaudio
from scipy.signal import resample
from transformers import pipeline

logging.getLogger('matplotlib').setLevel(logging.WARNING)

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav, speed_change

# logging.basicConfig(level=logging.DEBUG,
#                     format='%(asctime)s %(levelname)s %(message)s')

max_val = 0.8


def generate_seed():
    """Return a Gradio update dict carrying a fresh random seed."""
    seed = random.randint(1, 100000000)
    return {
        "__type__": "update",
        "value": seed
    }


def set_all_random_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) for reproducible inference."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim silence, peak-normalize to max_val, and append 0.2 s of trailing silence."""
    speech, _ = librosa.effects.trim(
        speech, top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length
    )
    if speech.abs().max() > max_val:
        speech = speech / speech.abs().max() * max_val
    speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
    return speech


@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav, seed):
    # For instruct mode, make sure the model is iic/CosyVoice-300M-Instruct
    # and the mode is not cross_lingual.
    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    speed_factor = 1  # fixed at 1 while the speed slider in the UI is disabled
    if speed_factor != 1.0:
        # Naive speed change: resample the waveform to 1/speed_factor of its length.
        audio_data = output['tts_speech'].numpy().flatten()
        new_length = int(len(audio_data) / speed_factor)
        audio_data = resample(audio_data, new_length)
    else:
        audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)


@spaces.GPU
def generate_text(prompt_wav):
    """Transcribe the prompt audio with Whisper so the user can proofread the transcript."""
    if prompt_wav:
        results = asr_pipeline(prompt_wav)
        return results['text']
    return "No valid input detected."


def main():
    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
        gr.Markdown(
            """# BreezyVoice 語音合成系統
#### Runs on Hugging Face ZeroGPU (H200)
為了加快推理速度,g2pw 注音標註並未被啟動。"""
        )

        # All content arranged in a single column
        with gr.Column():
            # Step 1: prompt audio input and its auto-generated transcript
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
            prompt_wav = gr.Audio(
                type='filepath',
                label='選擇 prompt 音訊檔案(確保取樣率不低於 16 kHz)或錄製 prompt 音訊'
            )
            with gr.Group():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
                    lines=2,
                    placeholder="音訊樣本文本"
                )
                # Auto-fill the transcript whenever a new prompt clip is uploaded or recorded.
                prompt_wav.input(
                    fn=generate_text,
                    inputs=[prompt_wav],
                    outputs=prompt_text
                )
            gr.Examples(
                examples=[
                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
                ],
                inputs=[prompt_wav, prompt_text],
                label="範例"
            )

            # Step 2: text to synthesize
            gr.Markdown("### 步驟 2. 合成文本輸入")
            tts_text = gr.Textbox(
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )

            # Step 3: synthesized audio output
            gr.Markdown("### 步驟 3. 合成音訊")
            with gr.Accordion("進階設定", open=False):
                seed = gr.Number(value=0, label="隨機推理種子")
                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")
                # The speed slider is disabled for now; generate_audio keeps speed_factor = 1.
                # speed_factor = gr.Slider(
                #     minimum=0.25, maximum=4, step=0.05,
                #     label="語速", value=1.0, interactive=True
                # )
            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")

            # Callbacks for seed generation and audio synthesis
            seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
            generate_button.click(
                fn=generate_audio,
                inputs=[tts_text, prompt_text, prompt_wav, seed],
                outputs=audio_output
            )

    demo.launch()


if __name__ == '__main__':
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        tokenizer="openai/whisper-tiny",
        device=0  # use GPU if available; set to -1 for CPU
    )
    sft_spk = cosyvoice.list_avaliable_spks()  # (sic) upstream CosyVoice method name
    prompt_sr, target_sr = 16000, 22050  # prompts load at 16 kHz; the model outputs 22.05 kHz
    default_data = np.zeros(target_sr)
    main()
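
# ----------------------------------------------------------------------------
# Headless usage sketch (not executed): a minimal example of driving the same
# zero-shot pipeline from a plain script, without the Gradio UI. It assumes
# the bundled examples/ clip shipped with this demo and that
# inference_zero_shot returns a {'tts_speech': Tensor} dict, exactly as
# generate_audio above relies on.
#
#     cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
#     prompt_speech_16k = load_wav('examples/commonvoice-example-1.mp3', 16000)
#     output = cosyvoice.inference_zero_shot(
#         '我今天忙了一整天,現在好想睡覺喔',  # text to synthesize
#         '明月幾時有,去問氣象局',            # transcript of the prompt clip
#         prompt_speech_16k
#     )
#     torchaudio.save('output.wav', output['tts_speech'], 22050)
# ----------------------------------------------------------------------------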