# Uploaded by kevinwang676 using huggingface_hub (commit 1b6bcbc, verified).
import json
import os
import re
from typing import List, Tuple, Union

import librosa
import numpy as np
import soundfile
def save_json(path: str, data: Union[List[dict], dict]):
    """Serialize ``data`` as JSON into the file at ``path``.

    The file is written as UTF-8 with ``ensure_ascii=False``, so non-ASCII
    text (e.g. CJK transcripts) is stored verbatim instead of as ``\\uXXXX``
    escapes.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: A dict, or a list of dicts, to serialize.
    """
    with open(path, 'w', encoding="utf-8") as target:
        # json.dump needs the open file object, not the path string.
        json.dump(data, target, ensure_ascii=False)
def load_json(path : str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)
def merge_audio_vads(source_path, save_path, vad_list: List[List], interval=1, sample_rate=None):
    """Cut the listed segments out of an audio file and join them with silence.

    Args:
        source_path: Audio file to read; loaded as mono through ``librosa.load``.
        save_path: Output file path. Its parent directory is created when the
            path has a directory component.
        vad_list: Segments to keep; item ``[0]`` is the start time and item
            ``[1]`` the end time, both in seconds.
        interval: Seconds of silence inserted between consecutive segments.
        sample_rate: Forwarded to ``librosa.load`` as ``sr``; ``None`` keeps
            the native sample rate of the file.

    Raises:
        ValueError: If ``vad_list`` contains no segments.
    """
    data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True)

    # One shared silence buffer, in the signal's own dtype so concatenation
    # does not upcast the whole recording to float64.
    silence = np.zeros(int(sample_rate * interval), dtype=data.dtype)

    audio_list = []
    for i, segment in enumerate(vad_list):
        start = int(segment[0] * sample_rate)
        end = int(segment[1] * sample_rate)
        if i > 0:
            audio_list.append(silence)
        audio_list.append(data[start:end])

    if not audio_list:
        # np.concatenate([]) would fail with a far less helpful message.
        raise ValueError("vad_list is empty: there are no segments to merge")
    audio_concat = np.concatenate(audio_list)

    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    soundfile.write(save_path, audio_concat, sample_rate)
def get_sub_dirs(source_dir):
    """Return the names of the immediate sub-directories of ``source_dir``.

    Entries whose name starts with a dot are skipped. Names (not full paths)
    are returned in the order produced by ``os.listdir``.
    """
    return [
        name
        for name in os.listdir(source_dir)
        if not name.startswith('.') and os.path.isdir(os.path.join(source_dir, name))
    ]
def ends_with_ending_sentence(sentence):
    """Return True when ``sentence`` ends with one of ``。``, ``?``, ``!`` or ``…``."""
    return re.search(r'[。?!…]$', sentence) is not None
def ends_with_punctuation(sentence):
    """Look for a punctuation-class character at the end of ``sentence``.

    The accepted set is ASCII ``. , ! ?``, the marks ``。``, ``、`` and ``・``,
    and the code-point ranges U+FF00-U+FFEF, U+3000-U+303F, U+3040-U+309F
    and U+30A0-U+30FF (so a trailing hiragana or katakana character also
    matches).

    Returns:
        The ``re.Match`` for the trailing character, or ``None`` when the
        sentence does not end with one; callers use the result for truthiness.
    """
    trailing = re.compile(r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$')
    return trailing.search(sentence)
def _save_audio_slice(sliced_audio_path, audio_list, sample_rate, speaker_name, language, text):
    """Write the concatenated audio chunks to ``sliced_audio_path`` and return its record."""
    soundfile.write(sliced_audio_path, np.concatenate(audio_list), sample_rate)
    return {
        'sliced_audio_path': sliced_audio_path,
        'speaker_name': speaker_name,
        'language': language,
        'text': text,
    }


def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> Tuple[List[dict], int]:
    """Group transcribed segments into sentence-level audio slices on disk.

    Consecutive segments are accumulated until either a segment's text ends
    a sentence (see ``ends_with_ending_sentence``) or adding the next segment
    would push the accumulated duration past ``max_seconds``. Each group is
    written to ``<slice_dir>/<count zero-padded to 6 digits>.wav``.

    Args:
        source_audio: Audio file to slice; loaded as mono via ``librosa.load``.
        slice_dir: Directory the slice files are written into.
        data_list: Segments as ``{'start': seconds, 'end': seconds, 'text': text}``.
            Segments whose stripped text is empty are skipped.
        start_count: Number used for the first slice file name.
        sample_rate: Forwarded to ``librosa.load`` as ``sr``.
        max_seconds: Duration limit that forces a flush of the pending group.
        language: Language tag stored in every record; ``"ZH"`` additionally
            enables the trailing ``。`` fix-up on length-forced flushes.
        speaker_name: Speaker name stored in every record.

    Returns:
        ``(result, count)`` where ``result`` is a list of
        ``{'sliced_audio_path', 'speaker_name', 'language', 'text'}`` dicts and
        ``count`` is the next unused slice number.
    """
    sentence_list = []
    audio_list = []
    time_length = 0
    count = start_count
    result = []

    data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True)
    for sentence in data_list:
        text = sentence['text'].strip()
        if not text:
            continue
        duration = sentence['end'] - sentence['start']
        start = int(sentence['start'] * sample_rate)
        end = int(sentence['end'] * sample_rate)

        # Flush the pending group first when this segment would overflow it.
        if time_length > 0 and time_length + duration > max_seconds:
            sliced_audio_path = os.path.join(slice_dir, str(count).zfill(6) + ".wav")
            s_sentence = "".join(sentence_list)

            # The group was cut mid-sentence: make Chinese text end with a full stop.
            if language == "ZH" and re.search(r"[,]$", s_sentence):
                s_sentence = s_sentence[:-1] + '。'
            if language == "ZH" and not ends_with_punctuation(s_sentence):
                s_sentence = s_sentence + "。"

            # Only reachable when a single segment alone exceeds the limit.
            if time_length > max_seconds:
                print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds")
            result.append(
                _save_audio_slice(sliced_audio_path, audio_list, sample_rate, speaker_name, language, s_sentence)
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count = count + 1

        sentence_list.append(text)
        audio_list.append(data[start:end])
        time_length = time_length + duration

        # A sentence-ending mark closes the group immediately.
        if ends_with_ending_sentence(text):
            sliced_audio_path = os.path.join(slice_dir, str(count).zfill(6) + ".wav")
            result.append(
                _save_audio_slice(sliced_audio_path, audio_list, sample_rate, speaker_name, language, "".join(sentence_list))
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count = count + 1

    # NOTE(review): segments still pending here (last text without a
    # sentence-ending mark) are not written out - confirm this is intended.
    return result, count