import re
import os
import json
from typing import List, Tuple, Union
import librosa
import soundfile
import numpy as np

def save_json(path: str, data: Union[List[dict], dict]):
    # Write data as UTF-8 JSON; dump to the file handle, not the path string.
    with open(path, 'w', encoding="utf-8") as target:
        json.dump(data, target, ensure_ascii=False)


def load_json(path: str):
    with open(path, 'r', encoding="utf-8") as source:
        return json.load(source)


def merge_audio_vads(source_path, save_path, vad_list: List[List], interval=1, sample_rate=None):
    """Concatenate the [start_seconds, end_seconds] segments in vad_list into one file,
    inserting `interval` seconds of silence between consecutive segments."""
    data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True)
    audio_list = []
    for i, segment in enumerate(vad_list):
        time_start = segment[0]
        time_end = segment[1]
        start = int(time_start * sample_rate)
        end = int(time_end * sample_rate)
        if i > 0:
            silence = np.zeros(int(sample_rate * interval))
            audio_list.append(silence)
        audio_list.append(data[start:end])
    audio_concat = np.concatenate(audio_list)
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    soundfile.write(save_path, audio_concat, sample_rate)
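

# Hypothetical usage sketch (paths and timestamps are illustrative assumptions,
# not from the original code): a VAD list of [start, end] pairs in seconds
# selects the speech regions to keep, with one second of silence between them.
def example_merge_vads():
    merge_audio_vads(
        source_path="raw/episode01.wav",        # assumed input file
        save_path="merged/episode01_vad.wav",   # parent directory is created
        vad_list=[[0.0, 3.2], [5.8, 9.1]],      # assumed output of an external VAD
        interval=1,
    )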


def get_sub_dirs(source_dir):
    # Immediate subdirectories of source_dir, skipping hidden entries.
    sub_dir = [f for f in os.listdir(source_dir) if not f.startswith('.')]
    sub_dir = [f for f in sub_dir if os.path.isdir(os.path.join(source_dir, f))]
    return sub_dir


def ends_with_ending_sentence(sentence):
    # True if the sentence ends with CJK sentence-final punctuation (。?!…).
    return bool(re.search(r'[。?!…]$', sentence))


def ends_with_punctuation(sentence):
    # Matches ASCII/CJK punctuation as well as the fullwidth-forms, CJK-symbol,
    # hiragana and katakana Unicode blocks at the end of the sentence.
    pattern = r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$'
    return re.search(pattern, sentence) is not None
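

# Illustrative check (assumed inputs): ends_with_ending_sentence only accepts
# sentence-final marks, while ends_with_punctuation also accepts commas etc.
def example_punctuation_checks():
    assert ends_with_ending_sentence("今日は晴れです。")
    assert not ends_with_ending_sentence("今日は晴れ、")
    assert ends_with_punctuation("今日は晴れ、")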


def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> Tuple[List[dict], int]:
    """Merge consecutive transcript sentences into sliced wav files.

    data_list: [{'start': seconds, 'end': seconds, 'text': text}, ...]
    Returns ([{'sliced_audio_path', 'speaker_name', 'language', 'text'}, ...], next_count).
    """
    sentence_list = []
    audio_list = []
    time_length = 0
    count = start_count
    result = []

    data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True)
    for sentence in data_list:
        text = sentence['text'].strip()
        if text == "":
            continue
        start = int(sentence['start'] * sample_rate)
        end = int(sentence['end'] * sample_rate)

        # Flush the buffered sentences if adding this one would push the slice past max_seconds.
        if time_length > 0 and time_length + (sentence['end'] - sentence['start']) > max_seconds:
            sliced_audio_name = f"{str(count).zfill(6)}"
            sliced_audio_path = os.path.join(slice_dir, sliced_audio_name + ".wav")
            s_sentence = "".join(sentence_list)

            # For Chinese text, normalize the trailing punctuation to a full stop.
            if language == "ZH" and re.search(r"[,]$", s_sentence):
                s_sentence = s_sentence[:-1] + '。'
            if language == "ZH" and not ends_with_punctuation(s_sentence):
                s_sentence = s_sentence + "。"

            audio_concat = np.concatenate(audio_list)
            if time_length > max_seconds:
                print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds")
            soundfile.write(sliced_audio_path, audio_concat, sample_rate)
            result.append(
                {
                    'sliced_audio_path': sliced_audio_path,
                    'speaker_name': speaker_name,
                    'language': language,
                    'text': s_sentence
                }
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count += 1

        sentence_list.append(text)
        audio_list.append(data[start:end])
        time_length += sentence['end'] - sentence['start']

        # Flush immediately when the sentence ends with sentence-final punctuation.
        if ends_with_ending_sentence(text):
            sliced_audio_name = f"{str(count).zfill(6)}"
            sliced_audio_path = os.path.join(slice_dir, sliced_audio_name + ".wav")
            s_sentence = "".join(sentence_list)
            audio_concat = np.concatenate(audio_list)
            soundfile.write(sliced_audio_path, audio_concat, sample_rate)

            result.append(
                {
                    'sliced_audio_path': sliced_audio_path,
                    'speaker_name': speaker_name,
                    'language': language,
                    'text': s_sentence
                }
            )
            sentence_list = []
            audio_list = []
            time_length = 0
            count += 1
    # Note: any sentences still buffered when the loop ends are not written out.
    return result, count
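

# Hypothetical usage sketch (not part of the original module): chains the helpers
# for one speaker. The paths, the transcript JSON layout, and the parameter values
# are assumptions for illustration only.
def example_slice_speaker(source_audio="data/speaker01/source.wav",
                          transcript_json="data/speaker01/transcript.json",
                          slice_dir="data/speaker01/slices",
                          out_json="data/speaker01/metadata.json"):
    # transcript_json is assumed to hold [{'start': s, 'end': s, 'text': ...}, ...]
    data_list = load_json(transcript_json)
    os.makedirs(slice_dir, exist_ok=True)
    result, next_count = merge_audio_slice(
        source_audio, slice_dir, data_list,
        start_count=0, sample_rate=None, max_seconds=10,
        language="ZH", speaker_name="speaker01")
    save_json(out_json, result)
    return next_count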