"""Gradio app for creating ``.npz`` audio prompts and (mock) synthesis.

Tab 1 turns an uploaded audio clip plus an optional transcript into an
``.npz`` file stored in the system temp directory.  Tab 2 lists the ``.npz``
files found there and calls a placeholder synthesis function.
"""
import logging
import os
import pathlib
import time
import tempfile
import platform
import gc

# Alias the non-native pathlib flavour to the native one.
# NOTE(review): presumably so path objects serialized on the other OS can be
# loaded here -- confirm before removing.
if platform.system().lower() == 'windows':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
elif platform.system().lower() == 'linux':
    temp = pathlib.WindowsPath
    pathlib.WindowsPath = pathlib.PosixPath
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

import langid
langid.set_languages(['en', 'zh', 'ja'])

import torch
import torchaudio
import numpy as np
from data.tokenizer import (
    AudioTokenizer,
    tokenize_audio,
)
from data.collation import get_text_token_collater
from models.vallex import VALLE
from utils.g2p import PhonemeBpeTokenizer
from descriptions import *
from macros import *
from examples import *

import gradio as gr
from vocos import Vocos
from transformers import WhisperProcessor, WhisperForConditionalGeneration

_logger = logging.getLogger(__name__)

# Required presets.
lang2token = {"en": "", "ja": "", "zh": ""}
lang2code = {"en": 0, "ja": 1, "zh": 2}
# NOTE: ``langid`` must stay bound to the module imported above; rebinding it
# to ``None`` here made ``langid.classify`` fail in ``make_npz_prompt``.

# Prompt files older than this many seconds are removed by ``clear_prompts``.
_PROMPT_MAX_AGE_SECONDS = 60
# A transcription already ending in one of these characters gets no extra ".".
_SENTENCE_ENDINGS = "?!.,。,?!。、"


def clear_prompts():
    """Delete stale ``.npz`` files from the system temp directory.

    A file is stale when its modification time is more than 60 seconds in the
    past.  This is best-effort housekeeping: filesystem errors are logged at
    debug level and never propagate to the caller.
    """
    tmp_dir = tempfile.gettempdir()
    try:
        entries = os.listdir(tmp_dir)
    except OSError:
        _logger.debug("Could not list temp directory %s", tmp_dir, exc_info=True)
        return

    cutoff = time.time() - _PROMPT_MAX_AGE_SECONDS
    for entry in entries:
        if not entry.endswith(".npz"):
            continue
        filename = os.path.join(tmp_dir, entry)
        try:
            if os.path.isfile(filename) and os.stat(filename).st_mtime < cutoff:
                os.remove(filename)
        except OSError:
            # Another process may have removed or locked the file; keep going
            # so one failure does not stop the rest of the clean-up.
            _logger.debug("Could not remove stale prompt %s", filename, exc_info=True)
    gc.collect()


def transcribe_one(wav, sr):
    """Transcribe one audio clip and report the language token that was decoded.

    NOTE(review): relies on module-level ``whisper_processor``, ``whisper`` and
    ``device`` which are not defined in the visible part of this file --
    confirm they are provided elsewhere (e.g. by one of the star imports).

    Args:
        wav: Audio tensor; it is resampled to 16 kHz when ``sr`` differs and
            ``squeeze(0)`` is applied before feature extraction.
        sr: Sample rate of ``wav`` in Hz.

    Returns:
        ``(lang, text)``: ``lang`` is the token at position 1 of the generated
        ids, decoded and stripped of ``<``, ``|`` and ``>``; ``text`` is the
        decoded transcription, with "." appended when it is non-empty and does
        not already end in punctuation.
    """
    if sr != 16000:
        wav4trans = torchaudio.transforms.Resample(sr, 16000)(wav)
    else:
        wav4trans = wav

    input_features = whisper_processor(
        wav4trans.squeeze(0), sampling_rate=16000, return_tensors="pt"
    ).input_features

    # generate token ids
    predicted_ids = whisper.generate(input_features.to(device))
    lang = whisper_processor.batch_decode(predicted_ids[:, 1])[0].strip("<|>")
    # decode token ids to text
    text_pr = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # print the recognized text
    print(text_pr)

    # Guard against an empty transcription: indexing [-1] on "" would raise.
    stripped = text_pr.strip(" ")
    if stripped and stripped[-1] not in _SENTENCE_ENDINGS:
        text_pr += "."

    # Drop the intermediate tensors before collecting.
    del wav4trans, input_features, predicted_ids
    gc.collect()
    return lang, text_pr


def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
    """Build an ``.npz`` prompt file from an audio clip and optional transcript.

    Args:
        name: Base name of the output file; only its final path component is
            used so the file always lands inside the temp directory.
        uploaded_audio: ``(sample_rate, samples)`` pair, or ``None``.
        recorded_audio: Fallback used when ``uploaded_audio`` is ``None``.
        transcript_content: Transcript of the clip.  When empty, the clip is
            transcribed with ``transcribe_one``; otherwise the language is
            guessed with ``langid``.

    Returns:
        ``(message, file_path)``.  ``file_path`` is ``None`` when the request
        is rejected, in which case ``message`` explains why.
    """
    clear_prompts()

    audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
    # The UI feeds a hidden Textbox into ``recorded_audio``, so without an
    # upload this is a string rather than an audio pair.
    if not isinstance(audio_prompt, (tuple, list)) or len(audio_prompt) != 2:
        return "Rejected, No audio provided", None
    sr, wav_pr = audio_prompt
    if len(wav_pr) == 0:
        return "Rejected, No audio provided", None
    if len(wav_pr) / sr > 15:
        return "Rejected, Audio too long (should be less than 15 seconds)", None

    if not isinstance(wav_pr, torch.FloatTensor):
        wav_pr = torch.FloatTensor(wav_pr)
    peak = wav_pr.abs().max()
    if peak > 1:
        wav_pr /= peak
    # A trailing dimension of size 2 is reduced to its first column.
    if wav_pr.size(-1) == 2:
        wav_pr = wav_pr[:, 0]
    if wav_pr.ndim == 1:
        wav_pr = wav_pr.unsqueeze(0)
    assert wav_pr.ndim and wav_pr.size(0) == 1

    if not transcript_content:
        lang_pr, text_body = transcribe_one(wav_pr, sr)
    else:
        lang_pr = langid.classify(str(transcript_content))[0]
        text_body = str(transcript_content).replace("\n", "")

    # Validate before any dictionary lookup so an unsupported language yields
    # the message below instead of a KeyError.
    if lang_pr not in ('ja', 'zh', 'en'):
        return f"Prompt can only made with one of model-supported languages, got {lang_pr} instead", None

    lang_token = lang2token[lang_pr]
    text_pr = f"{lang_token}{text_body}{lang_token}"

    # tokenize audio
    # NOTE(review): ``None`` is passed as the first argument of
    # ``tokenize_audio``; confirm the helper accepts that.
    encoded_frames = tokenize_audio(None, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    text_tokens = np.random.randint(0, 100, (1, 50))  # placeholder token generation

    message = f"Detected language: {lang_pr}\n Detected text: {text_pr}\n"

    # save as npz file; basename() keeps a user-supplied name from escaping
    # the temp directory via path separators.
    safe_name = os.path.basename(str(name))
    file_path = os.path.join(tempfile.gettempdir(), f"{safe_name}.npz")
    np.savez(
        file_path,
        audio_tokens=audio_tokens,
        text_tokens=text_tokens,
        lang_code=lang2code[lang_pr],
    )

    gc.collect()
    return message, file_path


def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
    """Placeholder synthesis: echo the text and return 24000 zero samples.

    Args:
        text: Text to synthesize; rejected when longer than 150 characters.
        language, accent, preset_prompt, prompt_file: Accepted but unused by
            this placeholder implementation.

    Returns:
        ``(message, audio)`` where ``audio`` is ``(24000, samples)`` or
        ``None`` when the text is rejected.
    """
    if len(text) > 150:
        return "Rejected, Text too long (should be less than 150 characters)", None
    return f"Synthesized text: {text}", (24000, np.zeros(24000))  # placeholder audio output


def get_available_npz_files():
    """Return the sorted names of all ``.npz`` files in the temp directory."""
    return sorted(f for f in os.listdir(tempfile.gettempdir()) if f.endswith(".npz"))


# Gradio application
with gr.Blocks() as app:
    with gr.Tabs():
        # Tab: create an NPZ file
        with gr.Tab("NPZファイルを作成"):
            gr.Markdown("### 音声とテキストから .npz ファイルを作成")
            name = gr.Textbox(label="ファイル名", placeholder="保存する .npz ファイル名を入力")
            uploaded_audio = gr.Audio(label="アップロード音声", type="numpy")
            transcript_content = gr.Textbox(label="テキスト内容", placeholder="音声に対応する文字起こしを入力")
            result_message = gr.Textbox(label="結果", interactive=False)
            npz_output = gr.File(label=".npz ファイル")
            save_button = gr.Button("変換して保存")
            dummy_input = gr.Textbox(visible=False)  # dummy component standing in for recorded audio
            save_button.click(
                make_npz_prompt,
                inputs=[name, uploaded_audio, dummy_input, transcript_content],
                outputs=[result_message, npz_output],
            )

        # Tab: generate from an NPZ file
        with gr.Tab("NPZファイルで生成"):
            gr.Markdown("### 保存した .npz ファイルから音声を生成")
            # The choices are read once, when the UI is built.
            npz_files_dropdown = gr.Dropdown(
                label="利用可能な .npz ファイル",
                choices=get_available_npz_files(),
                interactive=True
            )
            text_input = gr.Textbox(label="生成するテキスト", placeholder="150文字以内のテキストを入力")
            language = gr.Radio(
                label="言語選択",
                choices=["auto-detect", "en", "ja", "zh"],
                value="auto-detect"
            )
            accent = gr.Radio(
                label="アクセント選択",
                choices=["no-accent", "en-accent", "ja-accent", "zh-accent"],
                value="no-accent"
            )
            preset_prompt = gr.Textbox(label="プロンプト名", placeholder="既存のプロンプトを選択")
            synthesis_message = gr.Textbox(label="結果", interactive=False)
            audio_output = gr.Audio(label="生成音声", type="numpy")
            generate_button = gr.Button("生成開始")
            generate_button.click(
                infer_from_prompt,
                inputs=[text_input, language, accent, preset_prompt, npz_files_dropdown],
                outputs=[synthesis_message, audio_output],
            )

app.launch()