import re
import os
import json
from typing import List, Tuple, Union
import librosa
import soundfile
import numpy as np

def save_json(path: str, data: Union[List[dict], dict]):
    # Write data as UTF-8 JSON; dump to the file handle, not the path string.
    with open(path, 'w', encoding="utf-8") as target:
        json.dump(data, target, ensure_ascii=False)


def load_json(path: str):
    with open(path, 'r', encoding="utf-8") as source:
        return json.load(source)


def merge_audio_vads(source_path, save_path, vad_list: List[List], interval=1, sample_rate=None):
    """Concatenate the [start_seconds, end_seconds] segments in vad_list into one file,
    inserting `interval` seconds of silence between consecutive segments."""
    data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True)
    audio_list = []
    for i, segment in enumerate(vad_list):
        time_start = segment[0]
        time_end = segment[1]
        start = int(time_start * sample_rate)
        end = int(time_end * sample_rate)
        if i > 0:
            silence = np.zeros(int(sample_rate * interval))
            audio_list.append(silence)
        audio_list.append(data[start:end])
    audio_concat = np.concatenate(audio_list)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    soundfile.write(save_path, audio_concat, sample_rate)
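

# Hypothetical usage sketch (paths and timestamps are illustrative assumptions,
# not from the original code): a VAD list of [start, end] pairs in seconds
# selects the speech regions to keep, with one second of silence between them.
def example_merge_vads():
    merge_audio_vads(
        source_path="raw/episode01.wav",        # assumed input file
        save_path="merged/episode01_vad.wav",   # parent directory is created
        vad_list=[[0.0, 3.2], [5.8, 9.1]],      # assumed output of an external VAD
        interval=1,
    )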


def get_sub_dirs(source_dir):
    # Immediate subdirectories of source_dir, skipping hidden entries.
    sub_dir = [f for f in os.listdir(source_dir) if not f.startswith('.')]
    sub_dir = [f for f in sub_dir if os.path.isdir(os.path.join(source_dir, f))]
    return sub_dir


def ends_with_ending_sentence(sentence):
    # True if the sentence ends with CJK sentence-final punctuation (。?!…).
    return bool(re.search(r'[。?!…]$', sentence))


def ends_with_punctuation(sentence):
    # Matches ASCII/CJK punctuation as well as the fullwidth-forms, CJK-symbol,
    # hiragana and katakana Unicode blocks at the end of the sentence.
    pattern = r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$'
    return re.search(pattern, sentence) is not None
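

# Illustrative check (assumed inputs): ends_with_ending_sentence only accepts
# sentence-final marks, while ends_with_punctuation also accepts commas etc.
def example_punctuation_checks():
    assert ends_with_ending_sentence("今日は晴れです。")
    assert not ends_with_ending_sentence("今日は晴れ、")
    assert ends_with_punctuation("今日は晴れ、")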


def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> Tuple[List[dict], int]:
    """Merge consecutive transcript sentences into sliced wav files.

    data_list: [{'start': seconds, 'end': seconds, 'text': text}, ...]
    Returns ([{'sliced_audio_path', 'speaker_name', 'language', 'text'}, ...], next_count).
    """
    sentence_list = []
    audio_list = []
    time_length = 0
    count = start_count
    result = []

    data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True)
    for sentence in data_list:
        text = sentence['text'].strip()
        if text == "":
            continue
        start = int(sentence['start'] * sample_rate)
        end = int(sentence['end'] * sample_rate)

        # Flush the buffered sentences if adding this one would push the slice past max_seconds.
        if time_length > 0 and time_length + (sentence['end'] - sentence['start']) > max_seconds:
            sliced_audio_name = f"{str(count).zfill(6)}"
            sliced_audio_path = os.path.join(slice_dir, sliced_audio_name + ".wav")
            s_sentence = "".join(sentence_list)

            # For Chinese text, normalize the trailing punctuation to a full stop.
            if language == "ZH" and re.search(r"[,]$", s_sentence):
                s_sentence = s_sentence[:-1] + '。'
            if language == "ZH" and not ends_with_punctuation(s_sentence):
                s_sentence = s_sentence + "。"

            audio_concat = np.concatenate(audio_list)
            if time_length > max_seconds:
                print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds")
            soundfile.write(sliced_audio_path, audio_concat, sample_rate)
            result.append(
                {
                    'sliced_audio_path': sliced_audio_path,
                    'speaker_name': speaker_name,
                    'language': language,
                    'text': s_sentence
                }
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count += 1

        sentence_list.append(text)
        audio_list.append(data[start:end])
        time_length += sentence['end'] - sentence['start']

        # Flush immediately when the sentence ends with sentence-final punctuation.
        if ends_with_ending_sentence(text):
            sliced_audio_name = f"{str(count).zfill(6)}"
            sliced_audio_path = os.path.join(slice_dir, sliced_audio_name + ".wav")
            s_sentence = "".join(sentence_list)
            audio_concat = np.concatenate(audio_list)
            soundfile.write(sliced_audio_path, audio_concat, sample_rate)

            result.append(
                {
                    'sliced_audio_path': sliced_audio_path,
                    'speaker_name': speaker_name,
                    'language': language,
                    'text': s_sentence
                }
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count += 1
    # Note: any sentences still buffered when the loop ends are not written out.
    return result, count
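

# Hypothetical usage sketch (not part of the original module): chains the helpers
# for one speaker. The paths, the transcript JSON layout, and the parameter values
# are assumptions for illustration only.
def example_slice_speaker(source_audio="data/speaker01/source.wav",
                          transcript_json="data/speaker01/transcript.json",
                          slice_dir="data/speaker01/slices",
                          out_json="data/speaker01/metadata.json"):
    # transcript_json is assumed to hold [{'start': s, 'end': s, 'text': ...}, ...]
    data_list = load_json(transcript_json)
    os.makedirs(slice_dir, exist_ok=True)
    result, next_count = merge_audio_slice(
        source_audio, slice_dir, data_list,
        start_count=0, sample_rate=None, max_seconds=10,
        language="ZH", speaker_name="speaker01")
    save_json(out_json, result)
    return next_count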