Spaces:
No application file
No application file
import re | |
import os | |
import json | |
from typing import List, Union | |
import librosa | |
import soundfile | |
import numpy as np | |
def save_json(path : str, data : Union[List[dict], dict]):
    """Serialize ``data`` as JSON into the file at ``path``.

    The file is opened for writing with UTF-8 encoding (an existing file is
    overwritten) and non-ASCII characters are written verbatim rather than
    as ``\\uXXXX`` escapes.

    Args:
        path: Destination file path.
        data: A dict, or a list of dicts, to serialize.
    """
    with open(path, 'w', encoding="utf-8") as target:
        # json.dump needs the open file object, not the path string.
        json.dump(data, target, ensure_ascii=False)
def load_json(path : str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)
def merge_audio_vads(source_path ,save_path, vad_list : List[List], interval = 1, sample_rate = None):
    """Cut the given time ranges out of an audio file and join them into one file.

    The source is loaded as mono audio. Each ``[start, end]`` pair in
    ``vad_list`` is converted to sample indices and sliced out; consecutive
    slices are separated by ``interval`` seconds of silence. The result is
    written to ``save_path``.

    Args:
        source_path: Path of the audio file to read.
        save_path: Path of the audio file to write. Its parent directory is
            created when it does not exist.
        vad_list: Sequence of ``[start_seconds, end_seconds]`` pairs.
        interval: Length in seconds of the silence inserted between slices.
        sample_rate: Passed to ``librosa.load`` as ``sr``; the rate returned
            by the loader is used for all index arithmetic and for writing.

    Raises:
        ValueError: If ``vad_list`` is empty (there is nothing to write).
    """
    if not vad_list:
        # np.concatenate would fail on an empty list with an unhelpful message.
        raise ValueError("vad_list must contain at least one [start, end] segment")

    data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True)

    # Build the gap once; matching the audio dtype avoids upcasting the output.
    silence = np.zeros(int(sample_rate * interval), dtype=data.dtype)

    audio_list = []
    for i, segment in enumerate(vad_list):
        start = int(segment[0] * sample_rate)
        end = int(segment[1] * sample_rate)
        if i > 0:
            audio_list.append(silence)
        audio_list.append(data[start:end])
    audio_concat = np.concatenate(audio_list)

    # os.makedirs('') raises, so only create a directory when the path has one.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    soundfile.write(save_path, audio_concat, sample_rate)
def get_sub_dirs(source_dir):
    """Return the names of the non-hidden directories directly inside ``source_dir``.

    Entries whose name starts with ``.`` are skipped, as is anything that is
    not a directory. Names are returned in ``os.listdir`` order.
    """
    found = []
    for entry in os.listdir(source_dir):
        if entry.startswith('.'):
            continue
        if not os.path.isdir(os.path.join(source_dir, entry)):
            continue
        found.append(entry)
    return found
def ends_with_ending_sentence(sentence):
    """Return True when ``sentence`` ends with one of the characters ``。?!…``."""
    return re.search(r'[。?!…]$', sentence) is not None
def ends_with_punctuation(sentence):
    """Search ``sentence`` for a trailing character from the punctuation class.

    Returns the ``re.Match`` object when the last character belongs to the
    class, otherwise ``None``; callers can use the result as a truth value.
    Besides the listed marks, the class spans U+FF00-U+FFEF and
    U+3000-U+30FF, so a trailing full-width form or kana character also
    matches.
    """
    trailing_mark = re.compile(
        r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$'
    )
    return trailing_mark.search(sentence)
def _write_slice(sliced_audio_path, text, audio_list, sample_rate, speaker_name, language):
    """Write the buffered audio pieces to one file and return its metadata record."""
    soundfile.write(sliced_audio_path, np.concatenate(audio_list), sample_rate)
    return {
        'sliced_audio_path' : sliced_audio_path,
        'speaker_name' : speaker_name,
        'language' : language,
        'text' : text
    }


def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> tuple:
    """Group transcribed segments into clips and write each clip as a wav file.

    Segments are buffered in order. The buffer is written out as one clip
    either when adding the next segment would push the buffered duration
    past ``max_seconds``, or right after a segment whose text ends with a
    sentence-ending character (see ``ends_with_ending_sentence``). Clips are
    named with a zero-padded six-digit counter, e.g. ``000012.wav``.

    Args:
        source_audio: Path of the audio file to slice; loaded as mono.
        slice_dir: Directory the clip files are written into.
        data_list: ``[{'start': seconds, 'end': seconds, 'text': text}]``.
            Entries whose stripped text is empty are skipped.
        start_count: Counter value used for the first clip's file name.
        sample_rate: Passed to ``librosa.load`` as ``sr``; the rate returned
            by the loader is used for slicing and writing.
        max_seconds: Duration limit that forces the buffer to be written.
        language: Language tag stored in each record. For ``"ZH"``, a clip
            cut by the duration limit gets a closing ``。`` (replacing a
            trailing comma, or appended when no punctuation is present).
        speaker_name: Speaker name stored in each record.

    Returns:
        ``(result, count)`` where ``result`` is a list of
        ``{'sliced_audio_path', 'speaker_name', 'language', 'text'}`` dicts
        and ``count`` is the counter value for the next clip.
    """
    data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True)

    result = []
    count = start_count
    sentence_list = []
    audio_list = []
    time_length = 0

    for sentence in data_list:
        text = sentence['text'].strip()
        if text == "":
            continue

        duration = sentence['end'] - sentence['start']
        start = int(sentence['start'] * sample_rate)
        end = int(sentence['end'] * sample_rate)

        # Adding this segment would exceed the limit: write what is buffered first.
        if time_length > 0 and time_length + duration > max_seconds:
            sliced_audio_path = os.path.join(slice_dir, f"{str(count).zfill(6)}.wav")
            s_sentence = "".join(sentence_list)
            # The clip was cut mid-sentence, so give Chinese text a closing mark.
            if language == "ZH" and re.search(r"[,]$", s_sentence):
                s_sentence = s_sentence[:-1] + '。'
            if language == "ZH" and not ends_with_punctuation(s_sentence):
                s_sentence = s_sentence + "。"
            # Only reachable when a single buffered segment is itself over the limit.
            if time_length > max_seconds:
                print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds")
            result.append(
                _write_slice(sliced_audio_path, s_sentence, audio_list, sample_rate, speaker_name, language)
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count = count + 1

        sentence_list.append(text)
        audio_list.append(data[start:end])
        time_length = time_length + duration

        # A complete sentence closes the current clip.
        if ends_with_ending_sentence(text):
            sliced_audio_path = os.path.join(slice_dir, f"{str(count).zfill(6)}.wav")
            result.append(
                _write_slice(sliced_audio_path, "".join(sentence_list), audio_list, sample_rate, speaker_name, language)
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count = count + 1

    # NOTE(review): segments still buffered here (input ended without a
    # sentence-ending character) are not written — confirm this is intended.
    return result, count