import logging
import os
import pathlib
import time
import tempfile
import platform
import gc

# Checkpoints pickled on one OS can reference the other OS's pathlib class;
# alias the missing class so such checkpoints can still be unpickled here.
if platform.system().lower() == 'windows':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
elif platform.system().lower() == 'linux':
    temp = pathlib.WindowsPath
    pathlib.WindowsPath = pathlib.PosixPath

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
import langid
langid.set_languages(['en', 'zh', 'ja'])

import torch
import torchaudio
import numpy as np
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from descriptions import *
from macros import *
from examples import *

import gradio as gr
from vocos import Vocos
from transformers import WhisperProcessor, WhisperForConditionalGeneration
lang2token = {"en": "<en>", "ja": "<ja>", "zh": "<zh>"}
lang2code = {"en": 0, "ja": 1, "zh": 2}
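# Runtime setup. The functions below rely on `device`, `whisper_processor`,
# `whisper`, and an audio codec, which are not defined elsewhere in this
# snippet; this is a minimal sketch of that setup. The Whisper checkpoint name
# is an assumption (any Whisper size will do), and AudioTokenizer(device)
# follows the VALL-E X reference implementation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium").to(device)
whisper.eval()

# EnCodec-based codec used by tokenize_audio() in make_npz_prompt.
audio_tokenizer = AudioTokenizer(device)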
def clear_prompts():
    """Delete .npz prompt files in the temp directory that are older than 60 seconds."""
    try:
        path = tempfile.gettempdir()
        for eachfile in os.listdir(path):
            filename = os.path.join(path, eachfile)
            if os.path.isfile(filename) and filename.endswith(".npz"):
                lastmodifytime = os.stat(filename).st_mtime
                endfiletime = time.time() - 60
                if endfiletime > lastmodifytime:
                    os.remove(filename)
        gc.collect()
    except Exception:
        return
def transcribe_one(wav, sr):
    """Transcribe a waveform with Whisper and return (language, text)."""
    if sr != 16000:
        wav4trans = torchaudio.transforms.Resample(sr, 16000)(wav)
    else:
        wav4trans = wav

    input_features = whisper_processor(wav4trans.squeeze(0), sampling_rate=16000, return_tensors="pt").input_features

    predicted_ids = whisper.generate(input_features.to(device))
    # The second generated token is Whisper's language tag, e.g. "<|en|>".
    lang = whisper_processor.batch_decode(predicted_ids[:, 1])[0].strip("<|>")
    text_pr = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    print(text_pr)

    # Ensure the transcript ends with sentence-final punctuation.
    if not text_pr.strip(" ") or text_pr.strip(" ")[-1] not in "?!.,。,?!。、":
        text_pr += "."

    del wav4trans, input_features, predicted_ids
    gc.collect()
    return lang, text_pr
def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
    clear_prompts()
    audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
    sr, wav_pr = audio_prompt
    if len(wav_pr) / sr > 15:
        return "Rejected, audio too long (should be less than 15 seconds)", None
    if not isinstance(wav_pr, torch.FloatTensor):
        wav_pr = torch.FloatTensor(wav_pr)
    if wav_pr.abs().max() > 1:
        wav_pr /= wav_pr.abs().max()
    if wav_pr.size(-1) == 2:
        # Stereo input: keep the first channel only.
        wav_pr = wav_pr[:, 0]
    if wav_pr.ndim == 1:
        wav_pr = wav_pr.unsqueeze(0)
    assert wav_pr.ndim == 2 and wav_pr.size(0) == 1

    if transcript_content == "":
        lang_pr, text_pr = transcribe_one(wav_pr, sr)
        if lang_pr not in lang2token:
            return f"Prompt can only be made with one of the model-supported languages, got {lang_pr} instead", None
        lang_token = lang2token[lang_pr]
        text_pr = lang_token + text_pr + lang_token
    else:
        lang_pr = langid.classify(str(transcript_content))[0]
        lang_token = lang2token[lang_pr]
        transcript_content = transcript_content.replace("\n", "")
        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"

    # Encode the prompt audio into codec frames.
    encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # Placeholder: random text tokens stand in for the real phoneme tokenization
    # (PhonemeBpeTokenizer + get_text_token_collater in the full pipeline).
    text_tokens = np.random.randint(0, 100, (1, 50))

    message = f"Detected language: {lang_pr}\nDetected text: {text_pr}\n"

    file_path = os.path.join(tempfile.gettempdir(), f"{name}.npz")
    np.savez(file_path, audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])

    del audio_tokens, text_tokens, lang_pr, text_pr, wav_pr, sr, uploaded_audio, recorded_audio
    gc.collect()
    return message, file_path
def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
    """Placeholder synthesis: validates the input text and returns one second of
    silence instead of running the full VALLE + Vocos pipeline."""
    if len(text) > 150:
        return "Rejected, text too long (should be less than 150 characters)", None
    return f"Synthesized text: {text}", (24000, np.zeros(24000))
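# Illustrative sketch (not wired into the UI): how a saved prompt could be
# loaded back. The key names match the np.savez call in make_npz_prompt; the
# helper name itself is hypothetical.
def load_npz_prompt(prompt_file):
    prompt_path = os.path.join(tempfile.gettempdir(), prompt_file)
    data = np.load(prompt_path)
    return data["audio_tokens"], data["text_tokens"], int(data["lang_code"])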
def get_available_npz_files():
    """List the .npz prompt files currently in the temp directory."""
    return [f for f in os.listdir(tempfile.gettempdir()) if f.endswith(".npz")]
with gr.Blocks() as app:
    with gr.Tabs():

        with gr.Tab("Create NPZ file"):
            gr.Markdown("### Create an .npz file from audio and text")
            name = gr.Textbox(label="File name", placeholder="Enter the .npz file name to save")
            uploaded_audio = gr.Audio(label="Uploaded audio", type="numpy")
            transcript_content = gr.Textbox(label="Text content", placeholder="Enter the transcript that matches the audio")
            result_message = gr.Textbox(label="Result", interactive=False)
            npz_output = gr.File(label=".npz file")
            save_button = gr.Button("Convert and save")
            dummy_input = gr.Textbox(visible=False)  # stands in for the unused recorded-audio input

            save_button.click(
                make_npz_prompt,
                inputs=[name, uploaded_audio, dummy_input, transcript_content],
                outputs=[result_message, npz_output],
            )

        with gr.Tab("Generate with NPZ file"):
            gr.Markdown("### Generate speech from a saved .npz file")
            # Choices are populated once, when the app is built.
            npz_files_dropdown = gr.Dropdown(
                label="Available .npz files", choices=get_available_npz_files(), interactive=True
            )
            text_input = gr.Textbox(label="Text to generate", placeholder="Enter text within 150 characters")
            language = gr.Radio(
                label="Language selection",
                choices=["auto-detect", "en", "ja", "zh"],
                value="auto-detect",
            )
            accent = gr.Radio(
                label="Accent selection",
                choices=["no-accent", "en-accent", "ja-accent", "zh-accent"],
                value="no-accent",
            )
            preset_prompt = gr.Textbox(label="Prompt name", placeholder="Select an existing prompt")
            synthesis_message = gr.Textbox(label="Result", interactive=False)
            audio_output = gr.Audio(label="Generated audio", type="numpy")
            generate_button = gr.Button("Start generation")

            generate_button.click(
                infer_from_prompt,
                inputs=[text_input, language, accent, preset_prompt, npz_files_dropdown],
                outputs=[synthesis_message, audio_output],
            )

app.launch()