Stardust-minus committed on
Commit a26769d · verified · 1 parent: 39d5a3b

Upload folder using huggingface_hub
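
For reference, this commit message is the default one that `huggingface_hub` writes when a local folder is pushed with its upload helper. A minimal sketch of how such a commit is typically produced (the repo id below is a placeholder rather than the actual Space, and authentication is assumed to come from a cached login or the HF_TOKEN environment variable):

    from huggingface_hub import HfApi

    api = HfApi()  # token is read from the local cache or HF_TOKEN
    api.upload_folder(
        folder_path=".",                     # local working copy to push
        repo_id="<namespace>/<space-name>",  # placeholder target Space
        repo_type="space",
        commit_message="Upload folder using huggingface_hub",
    )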

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50):
  1. .gitattributes +5 -0
  2. README.md +1 -1
  3. app.py +73 -314
  4. examples/Arabic.wav +0 -0
  5. examples/English.wav +0 -0
  6. examples/French.wav +0 -0
  7. examples/German.wav +0 -0
  8. examples/Japanese.wav +2 -2
  9. examples/Korean.wav +2 -2
  10. examples/Nice English Ref.wav +2 -2
  11. examples/Spanish.wav +0 -0
  12. fish_speech/configs/base.yaml +87 -87
  13. fish_speech/configs/lora/r_8_alpha_16.yaml +4 -4
  14. fish_speech/configs/modded_dac_vq.yaml +50 -0
  15. fish_speech/configs/text2semantic_finetune.yaml +86 -83
  16. fish_speech/content_sequence.py +367 -0
  17. fish_speech/i18n/README.md +27 -27
  18. fish_speech/i18n/__init__.py +3 -3
  19. fish_speech/i18n/core.py +40 -40
  20. fish_speech/i18n/locale/en_US.json +123 -123
  21. fish_speech/i18n/locale/es_ES.json +123 -123
  22. fish_speech/i18n/locale/ja_JP.json +123 -123
  23. fish_speech/i18n/locale/ko_KR.json +123 -123
  24. fish_speech/i18n/locale/pt_BR.json +133 -133
  25. fish_speech/i18n/locale/zh_CN.json +123 -123
  26. fish_speech/i18n/scan.py +122 -122
  27. fish_speech/inference_engine/__init__.py +192 -0
  28. fish_speech/inference_engine/reference_loader.py +130 -0
  29. fish_speech/inference_engine/utils.py +29 -0
  30. fish_speech/inference_engine/vq_manager.py +59 -0
  31. fish_speech/models/dac/__init__.py +0 -0
  32. fish_speech/models/dac/inference.py +123 -0
  33. fish_speech/models/dac/modded_dac.py +1024 -0
  34. fish_speech/models/dac/rvq.py +403 -0
  35. fish_speech/models/text2semantic/inference.py +716 -0
  36. fish_speech/models/text2semantic/lit_module.py +202 -202
  37. fish_speech/models/text2semantic/llama.py +903 -887
  38. fish_speech/models/text2semantic/lora.py +92 -92
  39. fish_speech/text/__init__.py +4 -4
  40. fish_speech/text/clean.py +37 -37
  41. fish_speech/text/spliter.py +130 -130
  42. fish_speech/tokenizer.py +179 -152
  43. fish_speech/utils/__init__.py +24 -24
  44. fish_speech/utils/braceexpand.py +217 -217
  45. fish_speech/utils/context.py +13 -13
  46. fish_speech/utils/file.py +139 -16
  47. fish_speech/utils/instantiators.py +50 -50
  48. fish_speech/utils/logger.py +55 -55
  49. fish_speech/utils/logging_utils.py +48 -48
  50. fish_speech/utils/rich_utils.py +100 -100
.gitattributes CHANGED
@@ -36,3 +36,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 examples/Japanese.wav filter=lfs diff=lfs merge=lfs -text
 examples/Korean.wav filter=lfs diff=lfs merge=lfs -text
 examples/Nice[[:space:]]English[[:space:]]Ref.wav filter=lfs diff=lfs merge=lfs -text
+examples/Arabic.wav filter=lfs diff=lfs merge=lfs -text
+examples/English.wav filter=lfs diff=lfs merge=lfs -text
+examples/French.wav filter=lfs diff=lfs merge=lfs -text
+examples/German.wav filter=lfs diff=lfs merge=lfs -text
+examples/Spanish.wav filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Fish Speech 1
+title: OpenAudio S1
 emoji: 🏆
 colorFrom: purple
 colorTo: gray
app.py CHANGED
@@ -1,86 +1,51 @@
1
  import os
2
  import queue
3
  from huggingface_hub import snapshot_download
4
- import hydra
5
  import numpy as np
6
  import wave
7
  import io
8
- import pyrootutils
9
  import gc
 
10
 
11
  # Download if not exists
12
  os.makedirs("checkpoints", exist_ok=True)
13
- snapshot_download(repo_id="fishaudio/fish-speech-1.5", local_dir="./checkpoints/fish-speech-1.5")
14
 
15
  print("All checkpoints downloaded")
16
 
17
  import html
18
  import os
19
- import threading
20
  from argparse import ArgumentParser
21
  from pathlib import Path
22
- from functools import partial
23
 
24
  import gradio as gr
25
- import librosa
26
  import torch
27
  import torchaudio
28
 
29
  torchaudio.set_audio_backend("soundfile")
30
 
31
  from loguru import logger
32
- from transformers import AutoTokenizer
33
-
34
  from fish_speech.i18n import i18n
35
- from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
36
- from fish_speech.utils import autocast_exclude_mps, set_seed
37
- from tools.api import decode_vq_tokens, encode_reference
38
- from tools.file import AUDIO_EXTENSIONS, list_files
39
- from tools.llama.generate import (
40
- GenerateRequest,
41
- GenerateResponse,
42
- WrappedGenerateResponse,
43
- launch_thread_safe_queue,
44
- )
45
- from tools.vqgan.inference import load_model as load_decoder_model
46
-
47
- from tools.schema import (
48
- GLOBAL_NUM_SAMPLES,
49
- ASRPackRequest,
50
- ServeASRRequest,
51
- ServeASRResponse,
52
- ServeASRSegment,
53
- ServeAudioPart,
54
- ServeForwardMessage,
55
- ServeMessage,
56
- ServeRequest,
57
- ServeResponse,
58
- ServeStreamDelta,
59
- ServeStreamResponse,
60
- ServeTextPart,
61
- ServeTimedASRResponse,
62
- ServeTTSRequest,
63
- ServeVQGANDecodeRequest,
64
- ServeVQGANDecodeResponse,
65
- ServeVQGANEncodeRequest,
66
- ServeVQGANEncodeResponse,
67
- ServeVQPart,
68
- ServeReferenceAudio
69
- )
70
  # Make einx happy
71
  os.environ["EINX_FILTER_TRACEBACK"] = "false"
72
 
73
 
74
- HEADER_MD = """# Fish Speech
75
 
76
- ## The demo in this space is version 1.5, Please check [Fish Audio](https://fish.audio) for the best model.
77
- ## 该 Demo 为 Fish Speech 1.5 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
78
 
79
- A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
80
- 由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
81
 
82
- You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).
83
- 你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1.5) 找到模型.
84
 
85
  Related code and weights are released under CC BY-NC-SA 4.0 License.
86
  相关代码,权重使用 CC BY-NC-SA 4.0 许可证发布.
@@ -88,8 +53,8 @@ Related code and weights are released under CC BY-NC-SA 4.0 License.
88
  We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
89
  我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
90
 
91
- The model running in this WebUI is Fish Speech V1.5 Medium.
92
- 在此 WebUI 中运行的模型是 Fish Speech V1.5 Medium.
93
  """
94
 
95
  TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
@@ -106,7 +71,6 @@ except ImportError:
106
 
107
  return wrapper
108
 
109
-
110
  def build_html_error_message(error):
111
  return f"""
112
  <div style="color: red;
@@ -115,109 +79,6 @@ def build_html_error_message(error):
115
  </div>
116
  """
117
 
118
-
119
- @GPU_DECORATOR
120
- @torch.inference_mode()
121
- def inference(req: ServeTTSRequest):
122
- try:
123
- # Parse reference audio aka prompt
124
- refs = req.references
125
-
126
- prompt_tokens = [
127
- encode_reference(
128
- decoder_model=decoder_model,
129
- reference_audio=ref.audio,
130
- enable_reference_audio=True,
131
- )
132
- for ref in refs
133
- ]
134
- prompt_texts = [ref.text for ref in refs]
135
-
136
- if req.seed is not None:
137
- set_seed(req.seed)
138
- logger.warning(f"set seed: {req.seed}")
139
-
140
- # LLAMA Inference
141
- request = dict(
142
- device=decoder_model.device,
143
- max_new_tokens=req.max_new_tokens,
144
- text=(
145
- req.text
146
- if not req.normalize
147
- else ChnNormedText(raw_text=req.text).normalize()
148
- ),
149
- top_p=req.top_p,
150
- repetition_penalty=req.repetition_penalty,
151
- temperature=req.temperature,
152
- compile=args.compile,
153
- iterative_prompt=req.chunk_length > 0,
154
- chunk_length=req.chunk_length,
155
- max_length=4096,
156
- prompt_tokens=prompt_tokens,
157
- prompt_text=prompt_texts,
158
- )
159
-
160
- response_queue = queue.Queue()
161
- llama_queue.put(
162
- GenerateRequest(
163
- request=request,
164
- response_queue=response_queue,
165
- )
166
- )
167
-
168
- segments = []
169
-
170
- while True:
171
- result: WrappedGenerateResponse = response_queue.get()
172
- if result.status == "error":
173
- yield None, None, build_html_error_message(result.response)
174
- break
175
-
176
- result: GenerateResponse = result.response
177
- if result.action == "next":
178
- break
179
-
180
- with autocast_exclude_mps(
181
- device_type=decoder_model.device.type, dtype=args.precision
182
- ):
183
- fake_audios = decode_vq_tokens(
184
- decoder_model=decoder_model,
185
- codes=result.codes,
186
- )
187
-
188
- fake_audios = fake_audios.float().cpu().numpy()
189
- segments.append(fake_audios)
190
-
191
- if len(segments) == 0:
192
- return (
193
- None,
194
- None,
195
- build_html_error_message(
196
- i18n("No audio generated, please check the input text.")
197
- ),
198
- )
199
-
200
- # No matter streaming or not, we need to return the final audio
201
- audio = np.concatenate(segments, axis=0)
202
- yield None, (decoder_model.spec_transform.sample_rate, audio), None
203
-
204
- if torch.cuda.is_available():
205
- torch.cuda.empty_cache()
206
- gc.collect()
207
-
208
- except Exception as e:
209
- er = "CUDA error: device-side assert triggered"
210
- if er in str(e):
211
- app.close()
212
- else:
213
- raise Exception(e)
214
-
215
- n_audios = 4
216
-
217
- global_audio_list = []
218
- global_error_list = []
219
-
220
-
221
  def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
222
  buffer = io.BytesIO()
223
 
@@ -230,13 +91,8 @@ def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
230
  buffer.close()
231
  return wav_header_bytes
232
 
233
- def normalize_text(user_input, use_normalization):
234
- if use_normalization:
235
- return ChnNormedText(raw_text=user_input).normalize()
236
- else:
237
- return user_input
238
 
239
- def build_app():
240
  with gr.Blocks(theme=gr.themes.Base()) as app:
241
  gr.Markdown(HEADER_MD)
242
 
@@ -245,7 +101,7 @@ def build_app():
245
  None,
246
  None,
247
  js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}"
248
- % args.theme,
249
  )
250
 
251
  # Inference
@@ -254,20 +110,6 @@ def build_app():
254
  text = gr.Textbox(
255
  label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=10
256
  )
257
- refined_text = gr.Textbox(
258
- label=i18n("Realtime Transform Text"),
259
- placeholder=i18n(
260
- "Normalization Result Preview (Currently Only Chinese)"
261
- ),
262
- lines=5,
263
- interactive=False,
264
- )
265
-
266
- with gr.Row():
267
- normalize = gr.Checkbox(
268
- label=i18n("Text Normalization"),
269
- value=False,
270
- )
271
 
272
  with gr.Row():
273
  with gr.Column():
@@ -275,45 +117,45 @@ def build_app():
275
  with gr.Row():
276
  chunk_length = gr.Slider(
277
  label=i18n("Iterative Prompt Length, 0 means off"),
278
- minimum=0,
279
- maximum=300,
280
- value=200,
281
  step=8,
282
  )
283
 
284
  max_new_tokens = gr.Slider(
285
  label=i18n(
286
- "Maximum tokens per batch"
287
  ),
288
- minimum=512,
289
  maximum=2048,
290
- value=1024,
291
- step=64,
292
  )
293
 
294
  with gr.Row():
295
  top_p = gr.Slider(
296
  label="Top-P",
297
- minimum=0.6,
298
- maximum=0.9,
299
- value=0.7,
300
  step=0.01,
301
  )
302
 
303
  repetition_penalty = gr.Slider(
304
  label=i18n("Repetition Penalty"),
305
  minimum=1,
306
- maximum=1.5,
307
- value=1.2,
308
  step=0.01,
309
  )
310
 
311
  with gr.Row():
312
  temperature = gr.Slider(
313
  label="Temperature",
314
- minimum=0.6,
315
- maximum=0.9,
316
- value=0.7,
317
  step=0.01,
318
  )
319
  seed = gr.Number(
@@ -326,24 +168,20 @@ def build_app():
326
  with gr.Row():
327
  gr.Markdown(
328
  i18n(
329
- "15 to 60 seconds of reference audio, useful for specifying speaker."
330
  )
331
  )
332
-
333
  with gr.Row():
334
- # Add dropdown for selecting example audio files
335
- example_audio_files = [f for f in os.listdir("examples") if f.endswith(".wav")]
336
- example_audio_dropdown = gr.Dropdown(
337
- label="Select Example Audio",
338
- choices=[""] + example_audio_files,
339
- value=""
340
  )
341
 
342
  with gr.Row():
343
  use_memory_cache = gr.Radio(
344
  label=i18n("Use Memory Cache"),
345
- choices=["never"],
346
- value="never",
347
  )
348
 
349
  with gr.Row():
@@ -351,7 +189,6 @@ def build_app():
351
  label=i18n("Reference Audio"),
352
  type="filepath",
353
  )
354
-
355
  with gr.Row():
356
  reference_text = gr.Textbox(
357
  label=i18n("Reference Text"),
@@ -377,101 +214,16 @@ def build_app():
377
  with gr.Row():
378
  with gr.Column(scale=3):
379
  generate = gr.Button(
380
- value="\U0001F3A7 " + i18n("Generate"), variant="primary"
 
381
  )
382
 
383
- text.input(
384
- fn=normalize_text, inputs=[text, normalize], outputs=[refined_text]
385
- )
386
-
387
- def inference_wrapper(
388
- text,
389
- normalize,
390
- reference_audio,
391
- reference_text,
392
- max_new_tokens,
393
- chunk_length,
394
- top_p,
395
- repetition_penalty,
396
- temperature,
397
- seed,
398
- use_memory_cache,
399
- ):
400
- print(
401
- "call inference wrapper",
402
- text,
403
- normalize,
404
- reference_audio,
405
- reference_text,
406
- max_new_tokens,
407
- chunk_length,
408
- top_p,
409
- repetition_penalty,
410
- temperature,
411
- seed,
412
- use_memory_cache
413
- )
414
-
415
- references = []
416
- if reference_audio:
417
- # 将文件路径转换为字节
418
- with open(reference_audio, 'rb') as audio_file:
419
- audio_bytes = audio_file.read()
420
-
421
- references = [
422
- ServeReferenceAudio(audio=audio_bytes, text=reference_text)
423
- ]
424
-
425
- req = ServeTTSRequest(
426
- text=text,
427
- normalize=normalize,
428
- reference_id=None,
429
- references=references,
430
- max_new_tokens=max_new_tokens,
431
- chunk_length=chunk_length,
432
- top_p=top_p,
433
- repetition_penalty=repetition_penalty,
434
- temperature=temperature,
435
- seed=int(seed) if seed else None,
436
- use_memory_cache=use_memory_cache,
437
- )
438
-
439
- for result in inference(req):
440
- if result[2]: # Error message
441
- return None, result[2]
442
- elif result[1]: # Audio data
443
- return result[1], None
444
-
445
- return None, i18n("No audio generated")
446
-
447
- def select_example_audio(audio_file):
448
- if audio_file:
449
- audio_path = os.path.join("examples", audio_file)
450
- lab_file = os.path.splitext(audio_file)[0] + ".lab"
451
- lab_path = os.path.join("examples", lab_file)
452
-
453
- if os.path.exists(lab_path):
454
- with open(lab_path, "r", encoding="utf-8") as f:
455
- lab_content = f.read().strip()
456
- else:
457
- lab_content = ""
458
-
459
- return audio_path, lab_content
460
- return None, ""
461
-
462
- # Connect the dropdown to update reference audio and text
463
- example_audio_dropdown.change(
464
- fn=select_example_audio,
465
- inputs=[example_audio_dropdown],
466
- outputs=[reference_audio, reference_text]
467
- )
468
-
469
  # Submit
470
  generate.click(
471
- inference_wrapper,
472
  [
473
- refined_text,
474
- normalize,
475
  reference_audio,
476
  reference_text,
477
  max_new_tokens,
@@ -488,26 +240,24 @@ def build_app():
488
 
489
  return app
490
 
491
-
492
-
493
  def parse_args():
494
  parser = ArgumentParser()
495
  parser.add_argument(
496
  "--llama-checkpoint-path",
497
  type=Path,
498
- default="checkpoints/fish-speech-1.5",
499
  )
500
  parser.add_argument(
501
  "--decoder-checkpoint-path",
502
  type=Path,
503
- default="checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
504
  )
505
- parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
506
  parser.add_argument("--device", type=str, default="cuda")
507
  parser.add_argument("--half", action="store_true")
508
  parser.add_argument("--compile", action="store_true",default=True)
509
  parser.add_argument("--max-gradio-length", type=int, default=0)
510
- parser.add_argument("--theme", type=str, default="light")
511
 
512
  return parser.parse_args()
513
 
@@ -533,25 +283,34 @@ if __name__ == "__main__":
533
 
534
  logger.info("Decoder model loaded, warming up...")
535
 
536
  # Dry run to check if the model is loaded correctly and avoid the first-time latency
537
  list(
538
- inference(
539
- ServeTTSRequest(
540
- text="Hello world.",
541
- references=[],
542
- reference_id=None,
543
- max_new_tokens=0,
544
- chunk_length=200,
545
- top_p=0.7,
546
- repetition_penalty=1.5,
547
- temperature=0.7,
548
- emotion=None,
549
- format="wav",
550
- )
551
  )
 
552
  )
553
 
554
  logger.info("Warming up done, launching the web UI...")
555
 
556
- app = build_app()
557
- app.queue(api_open=True).launch(show_error=True, show_api=True)
 
 
 
1
  import os
2
  import queue
3
  from huggingface_hub import snapshot_download
 
4
  import numpy as np
5
  import wave
6
  import io
 
7
  import gc
8
+ from typing import Callable
9
 
10
  # Download if not exists
11
  os.makedirs("checkpoints", exist_ok=True)
12
+ snapshot_download(repo_id="fishaudio/openaudio-s1-mini", local_dir="./checkpoints/openaudio-s1-mini")
13
 
14
  print("All checkpoints downloaded")
15
 
16
  import html
17
  import os
 
18
  from argparse import ArgumentParser
19
  from pathlib import Path
 
20
 
21
  import gradio as gr
 
22
  import torch
23
  import torchaudio
24
 
25
  torchaudio.set_audio_backend("soundfile")
26
 
27
  from loguru import logger
 
 
28
  from fish_speech.i18n import i18n
29
+ from fish_speech.inference_engine import TTSInferenceEngine
30
+ from fish_speech.models.dac.inference import load_model as load_decoder_model
31
+ from fish_speech.models.text2semantic.inference import launch_thread_safe_queue
32
+ from tools.webui.inference import get_inference_wrapper
33
+ from fish_speech.utils.schema import ServeTTSRequest
34
+
35
  # Make einx happy
36
  os.environ["EINX_FILTER_TRACEBACK"] = "false"
37
 
38
 
39
+ HEADER_MD = """# OpenAudio S1
40
 
41
+ ## The demo in this space is OpenAudio S1, Please check [Fish Audio](https://fish.audio) for the best model.
42
+ ## 该 Demo 为 OpenAudio S1 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
43
 
44
+ A text-to-speech model based on DAC and Qwen3 developed by [Fish Audio](https://fish.audio).
45
+ 由 [Fish Audio](https://fish.audio) 研发的基于 DAC 和 Qwen3 的多语种语音合成.
46
 
47
+ You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/openaudio-s1-mini).
48
+ 你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/openaudio-s1-mini) 找到模型.
49
 
50
  Related code and weights are released under CC BY-NC-SA 4.0 License.
51
  相关代码,权重使用 CC BY-NC-SA 4.0 许可证发布.
 
53
  We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
54
  我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
55
 
56
+ The model running in this WebUI is OpenAudio S1 Mini.
57
+ 在此 WebUI 中运行的模型是 OpenAudio S1 Mini.
58
  """
59
 
60
  TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
 
71
 
72
  return wrapper
73
 
 
74
  def build_html_error_message(error):
75
  return f"""
76
  <div style="color: red;
 
79
  </div>
80
  """
81
 
82
  def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
83
  buffer = io.BytesIO()
84
 
 
91
  buffer.close()
92
  return wav_header_bytes
93
 
 
94
 
95
+ def build_app(inference_fct: Callable, theme: str = "light") -> gr.Blocks:
96
  with gr.Blocks(theme=gr.themes.Base()) as app:
97
  gr.Markdown(HEADER_MD)
98
 
 
101
  None,
102
  None,
103
  js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}"
104
+ % theme,
105
  )
106
 
107
  # Inference
 
110
  text = gr.Textbox(
111
  label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=10
112
  )
113
 
114
  with gr.Row():
115
  with gr.Column():
 
117
  with gr.Row():
118
  chunk_length = gr.Slider(
119
  label=i18n("Iterative Prompt Length, 0 means off"),
120
+ minimum=100,
121
+ maximum=400,
122
+ value=300,
123
  step=8,
124
  )
125
 
126
  max_new_tokens = gr.Slider(
127
  label=i18n(
128
+ "Maximum tokens per batch, 0 means no limit"
129
  ),
130
+ minimum=0,
131
  maximum=2048,
132
+ value=0,
133
+ step=8,
134
  )
135
 
136
  with gr.Row():
137
  top_p = gr.Slider(
138
  label="Top-P",
139
+ minimum=0.7,
140
+ maximum=0.95,
141
+ value=0.8,
142
  step=0.01,
143
  )
144
 
145
  repetition_penalty = gr.Slider(
146
  label=i18n("Repetition Penalty"),
147
  minimum=1,
148
+ maximum=1.2,
149
+ value=1.1,
150
  step=0.01,
151
  )
152
 
153
  with gr.Row():
154
  temperature = gr.Slider(
155
  label="Temperature",
156
+ minimum=0.7,
157
+ maximum=1.0,
158
+ value=0.8,
159
  step=0.01,
160
  )
161
  seed = gr.Number(
 
168
  with gr.Row():
169
  gr.Markdown(
170
  i18n(
171
+ "5 to 10 seconds of reference audio, useful for specifying speaker."
172
  )
173
  )
 
174
  with gr.Row():
175
+ reference_id = gr.Textbox(
176
+ label=i18n("Reference ID"),
177
+ placeholder="Leave empty to use uploaded references",
178
  )
179
 
180
  with gr.Row():
181
  use_memory_cache = gr.Radio(
182
  label=i18n("Use Memory Cache"),
183
+ choices=["on", "off"],
184
+ value="on",
185
  )
186
 
187
  with gr.Row():
 
189
  label=i18n("Reference Audio"),
190
  type="filepath",
191
  )
 
192
  with gr.Row():
193
  reference_text = gr.Textbox(
194
  label=i18n("Reference Text"),
 
214
  with gr.Row():
215
  with gr.Column(scale=3):
216
  generate = gr.Button(
217
+ value="\U0001f3a7 " + i18n("Generate"),
218
+ variant="primary",
219
  )
220
 
 
221
  # Submit
222
  generate.click(
223
+ inference_fct,
224
  [
225
+ text,
226
+ reference_id,
227
  reference_audio,
228
  reference_text,
229
  max_new_tokens,
 
240
 
241
  return app
242
 
 
 
243
  def parse_args():
244
  parser = ArgumentParser()
245
  parser.add_argument(
246
  "--llama-checkpoint-path",
247
  type=Path,
248
+ default="checkpoints/openaudio-s1-mini",
249
  )
250
  parser.add_argument(
251
  "--decoder-checkpoint-path",
252
  type=Path,
253
+ default="checkpoints/openaudio-s1-mini/codec.pth",
254
  )
255
+ parser.add_argument("--decoder-config-name", type=str, default="modded_dac_vq")
256
  parser.add_argument("--device", type=str, default="cuda")
257
  parser.add_argument("--half", action="store_true")
258
  parser.add_argument("--compile", action="store_true",default=True)
259
  parser.add_argument("--max-gradio-length", type=int, default=0)
260
+ parser.add_argument("--theme", type=str, default="dark")
261
 
262
  return parser.parse_args()
263
 
 
283
 
284
  logger.info("Decoder model loaded, warming up...")
285
 
286
+ # Create the inference engine
287
+ inference_engine = TTSInferenceEngine(
288
+ llama_queue=llama_queue,
289
+ decoder_model=decoder_model,
290
+ compile=args.compile,
291
+ precision=args.precision,
292
+ )
293
+
294
  # Dry run to check if the model is loaded correctly and avoid the first-time latency
295
  list(
296
+ inference_engine.inference(
297
+ ServeTTSRequest(
298
+ text="Hello world.",
299
+ references=[],
300
+ reference_id=None,
301
+ max_new_tokens=1024,
302
+ chunk_length=200,
303
+ top_p=0.7,
304
+ repetition_penalty=1.5,
305
+ temperature=0.7,
306
+ format="wav",
 
 
307
  )
308
+ )
309
  )
310
 
311
  logger.info("Warming up done, launching the web UI...")
312
 
313
+ inference_fct = get_inference_wrapper(inference_engine)
314
+
315
+ app = build_app(inference_fct, args.theme)
316
+ app.queue(api_open=True).launch(show_error=True, show_api=True, server_name="0.0.0.0", server_port=18888)
examples/Arabic.wav CHANGED
Binary files a/examples/Arabic.wav and b/examples/Arabic.wav differ
 
examples/English.wav CHANGED
Binary files a/examples/English.wav and b/examples/English.wav differ
 
examples/French.wav CHANGED
Binary files a/examples/French.wav and b/examples/French.wav differ
 
examples/German.wav CHANGED
Binary files a/examples/German.wav and b/examples/German.wav differ
 
examples/Japanese.wav CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a23cffeac70f42e1cc69e2a0505e4c1fda50884dd34c509128d432aaf44565e5
-size 1148682
+oid sha256:3034a38260884be854cb4a3f6cb648db85ebdeeb8cab74cfae2a578dc7aaedc2
+size 132
examples/Korean.wav CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4234f119c741782e2c9c0ede4b5b864a560a355c28a23b2332e79420b69961a
-size 1632522
+oid sha256:5767663f0c26f4dc94f45227f385c2be568aac065272466915d65eaa64fdda0f
+size 132
examples/Nice English Ref.wav CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d00ad9768c62f9821fc01ecab3e02669581ca75c18af6549690e19ce90a09f53
-size 5254482
+oid sha256:4b707de0cfc5d2eee59dcc3fea495603fe28d95ca64d8202bcdb31537d588782
+size 132
examples/Spanish.wav CHANGED
Binary files a/examples/Spanish.wav and b/examples/Spanish.wav differ
 
fish_speech/configs/base.yaml CHANGED
@@ -1,87 +1,87 @@
1
- # Base configuration for training a model
2
- paths:
3
- run_dir: results/${project}
4
- ckpt_dir: ${paths.run_dir}/checkpoints
5
-
6
- hydra:
7
- run:
8
- dir: ${paths.run_dir}
9
-
10
- # Lightning Trainer
11
- trainer:
12
- _target_: lightning.pytorch.trainer.Trainer
13
-
14
- default_root_dir: ${paths.run_dir}
15
- accelerator: gpu
16
- num_nodes: 1
17
- devices: auto
18
- strategy:
19
- _target_: lightning.pytorch.strategies.DDPStrategy
20
- process_group_backend: nccl # This should be override when training on windows
21
-
22
- precision: bf16-mixed
23
-
24
- # disable validation by epoch end
25
- check_val_every_n_epoch: null
26
- val_check_interval: 5000
27
- max_steps: 100_000
28
-
29
- # Use torch.backends.cudnn.benchmark to speed up training
30
- benchmark: true
31
-
32
- # Callbacks
33
- callbacks:
34
- model_checkpoint:
35
- _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
- dirpath: ${paths.ckpt_dir}
37
- filename: "step_{step:09d}"
38
- save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
39
- save_top_k: 5 # save 5 latest checkpoints
40
- monitor: step # use step to monitor checkpoints
41
- mode: max # save the latest checkpoint with the highest global_step
42
- every_n_epochs: null # don't save checkpoints by epoch end
43
- every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
- auto_insert_metric_name: false
45
-
46
- model_summary:
47
- _target_: lightning.pytorch.callbacks.ModelSummary
48
- max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
-
50
- learning_rate_monitor:
51
- _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
- logging_interval: step
53
- log_momentum: false
54
-
55
- grad_norm_monitor:
56
- _target_: fish_speech.callbacks.GradNormMonitor
57
- norm_type: 2
58
- logging_interval: step
59
-
60
- # Logger
61
- logger:
62
- tensorboard:
63
- _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
- save_dir: "${paths.run_dir}/tensorboard/"
65
- name: null
66
- log_graph: false
67
- default_hp_metric: true
68
- prefix: ""
69
-
70
- # wandb:
71
- # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
- # # name: "" # name of the run (normally generated by wandb)
73
- # save_dir: "${paths.run_dir}"
74
- # offline: False
75
- # id: null # pass correct id to resume experiment!
76
- # anonymous: null # enable anonymous logging
77
- # project: "fish-speech"
78
- # log_model: False # upload lightning ckpts
79
- # prefix: "" # a string to put at the beginning of metric keys
80
- # # entity: "" # set to name of your wandb team
81
- # group: ""
82
- # tags: ["vq", "hq", "finetune"]
83
- # job_type: ""
84
-
85
- # Loop
86
- train: true
87
- test: false
 
1
+ # Base configuration for training a model
2
+ paths:
3
+ run_dir: results/${project}
4
+ ckpt_dir: ${paths.run_dir}/checkpoints
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${paths.run_dir}
9
+
10
+ # Lightning Trainer
11
+ trainer:
12
+ _target_: lightning.pytorch.trainer.Trainer
13
+
14
+ default_root_dir: ${paths.run_dir}
15
+ accelerator: gpu
16
+ num_nodes: 1
17
+ devices: auto
18
+ strategy:
19
+ _target_: lightning.pytorch.strategies.DDPStrategy
20
+ process_group_backend: nccl # This should be override when training on windows
21
+
22
+ precision: bf16-mixed
23
+
24
+ # disable validation by epoch end
25
+ check_val_every_n_epoch: null
26
+ val_check_interval: 5000
27
+ max_steps: 100_000
28
+
29
+ # Use torch.backends.cudnn.benchmark to speed up training
30
+ benchmark: true
31
+
32
+ # Callbacks
33
+ callbacks:
34
+ model_checkpoint:
35
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
+ dirpath: ${paths.ckpt_dir}
37
+ filename: "step_{step:09d}"
38
+ save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
39
+ save_top_k: 5 # save 5 latest checkpoints
40
+ monitor: step # use step to monitor checkpoints
41
+ mode: max # save the latest checkpoint with the highest global_step
42
+ every_n_epochs: null # don't save checkpoints by epoch end
43
+ every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
+ auto_insert_metric_name: false
45
+
46
+ model_summary:
47
+ _target_: lightning.pytorch.callbacks.ModelSummary
48
+ max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
+
50
+ learning_rate_monitor:
51
+ _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
+ logging_interval: step
53
+ log_momentum: false
54
+
55
+ grad_norm_monitor:
56
+ _target_: fish_speech.callbacks.GradNormMonitor
57
+ norm_type: 2
58
+ logging_interval: step
59
+
60
+ # Logger
61
+ logger:
62
+ tensorboard:
63
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
+ save_dir: "${paths.run_dir}/tensorboard/"
65
+ name: null
66
+ log_graph: false
67
+ default_hp_metric: true
68
+ prefix: ""
69
+
70
+ # wandb:
71
+ # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
+ # # name: "" # name of the run (normally generated by wandb)
73
+ # save_dir: "${paths.run_dir}"
74
+ # offline: False
75
+ # id: null # pass correct id to resume experiment!
76
+ # anonymous: null # enable anonymous logging
77
+ # project: "fish-speech"
78
+ # log_model: False # upload lightning ckpts
79
+ # prefix: "" # a string to put at the beginning of metric keys
80
+ # # entity: "" # set to name of your wandb team
81
+ # group: ""
82
+ # tags: ["vq", "hq", "finetune"]
83
+ # job_type: ""
84
+
85
+ # Loop
86
+ train: true
87
+ test: false
fish_speech/configs/lora/r_8_alpha_16.yaml CHANGED
@@ -1,4 +1,4 @@
-_target_: fish_speech.models.text2semantic.lora.LoraConfig
-r: 8
-lora_alpha: 16
-lora_dropout: 0.01
+_target_: fish_speech.models.text2semantic.lora.LoraConfig
+r: 8
+lora_alpha: 16
+lora_dropout: 0.01
fish_speech/configs/modded_dac_vq.yaml ADDED
@@ -0,0 +1,50 @@
1
+ _target_: fish_speech.models.dac.modded_dac.DAC
2
+ # Model setup
3
+ sample_rate: 44100
4
+ encoder_dim: 64
5
+ encoder_rates: [2, 4, 8, 8]
6
+ decoder_dim: 1536
7
+ decoder_rates: [8, 8, 4, 2]
8
+ encoder_transformer_layers: [0, 0, 0, 4]
9
+ decoder_transformer_layers: [4, 0, 0, 0]
10
+ transformer_general_config:
11
+ _target_: fish_speech.models.dac.modded_dac.ModelArgs
12
+ _partial_: true
13
+ block_size: 16384
14
+ n_local_heads: -1
15
+ head_dim: 64
16
+ rope_base: 10000
17
+ norm_eps: 1e-5
18
+ dropout_rate: 0.1
19
+ attn_dropout_rate: 0.1
20
+ channels_first: true
21
+ # Quantization
22
+ quantizer:
23
+ _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize
24
+ input_dim: 1024
25
+ n_codebooks: 9
26
+ codebook_size: 1024
27
+ codebook_dim: 8
28
+ quantizer_dropout: 0.5
29
+ downsample_factor: [2, 2]
30
+ post_module: &transformer_module
31
+ _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer
32
+ causal: true
33
+ window_size: 128 # empirically this does not seem to matter
34
+ input_dim: 1024
35
+ config: &transformer_config
36
+ _target_: fish_speech.models.dac.modded_dac.ModelArgs
37
+ block_size: 4096
38
+ n_layer: 8
39
+ n_head: 16
40
+ dim: 1024
41
+ intermediate_size: 3072
42
+ n_local_heads: -1
43
+ head_dim: 64
44
+ rope_base: 10000
45
+ norm_eps: 1e-5
46
+ dropout_rate: 0.1
47
+ attn_dropout_rate: 0.1
48
+ channels_first: true
49
+ pre_module: *transformer_module
50
+ semantic_codebook_size: 4096
fish_speech/configs/text2semantic_finetune.yaml CHANGED
@@ -1,83 +1,86 @@
1
- defaults:
2
- - base
3
- - _self_
4
-
5
- project: text2semantic_finetune_dual_ar
6
- max_length: 4096
7
- pretrained_ckpt_path: checkpoints/fish-speech-1.4
8
-
9
- # Lightning Trainer
10
- trainer:
11
- accumulate_grad_batches: 1
12
- gradient_clip_val: 1.0
13
- gradient_clip_algorithm: "norm"
14
- max_steps: 1000
15
- precision: bf16-true
16
- limit_val_batches: 10
17
- val_check_interval: 100
18
-
19
- # Dataset Configuration
20
- tokenizer:
21
- _target_: transformers.AutoTokenizer.from_pretrained
22
- pretrained_model_name_or_path: ${pretrained_ckpt_path}
23
-
24
- # Dataset Configuration
25
- train_dataset:
26
- _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
27
- proto_files:
28
- - data/protos
29
- tokenizer: ${tokenizer}
30
- causal: true
31
- max_length: ${max_length}
32
- use_speaker: false
33
- interactive_prob: 0.7
34
-
35
- val_dataset:
36
- _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
37
- proto_files:
38
- - data/protos
39
- tokenizer: ${tokenizer}
40
- causal: true
41
- max_length: ${max_length}
42
- use_speaker: false
43
- interactive_prob: 0.7
44
-
45
- data:
46
- _target_: fish_speech.datasets.semantic.SemanticDataModule
47
- train_dataset: ${train_dataset}
48
- val_dataset: ${val_dataset}
49
- num_workers: 4
50
- batch_size: 8
51
- tokenizer: ${tokenizer}
52
- max_length: ${max_length}
53
-
54
- # Model Configuration
55
- model:
56
- _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
57
- model:
58
- _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
59
- path: ${pretrained_ckpt_path}
60
- load_weights: true
61
- max_length: ${max_length}
62
- lora_config: null
63
-
64
- optimizer:
65
- _target_: torch.optim.AdamW
66
- _partial_: true
67
- lr: 1e-4
68
- weight_decay: 0
69
- betas: [0.9, 0.95]
70
- eps: 1e-5
71
-
72
- lr_scheduler:
73
- _target_: torch.optim.lr_scheduler.LambdaLR
74
- _partial_: true
75
- lr_lambda:
76
- _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
77
- _partial_: true
78
- num_warmup_steps: 10
79
-
80
- # Callbacks
81
- callbacks:
82
- model_checkpoint:
83
- every_n_train_steps: ${trainer.val_check_interval}
 
 
 
 
1
+ defaults:
2
+ - base
3
+ - _self_
4
+
5
+ project: text2semantic_finetune_dual_ar
6
+ max_length: 4096
7
+ pretrained_ckpt_path: checkpoints/openaudio-s1-mini
8
+
9
+ # Lightning Trainer
10
+ trainer:
11
+ accumulate_grad_batches: 1
12
+ gradient_clip_val: 1.0
13
+ gradient_clip_algorithm: "norm"
14
+ max_steps: 10000
15
+ precision: bf16-true
16
+ limit_val_batches: 10
17
+ val_check_interval: 100
18
+ # strategy:
19
+ # find_unused_parameters: true
20
+ # static_graph: true
21
+
22
+ # Dataset Configuration
23
+ tokenizer:
24
+ _target_: fish_speech.tokenizer.FishTokenizer
25
+ model_path: ${pretrained_ckpt_path}/tokenizer.tiktoken
26
+
27
+ # Dataset Configuration
28
+ train_dataset:
29
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
30
+ proto_files:
31
+ - data/protos
32
+ tokenizer: ${tokenizer}
33
+ causal: true
34
+ max_length: ${max_length}
35
+ use_speaker: false
36
+ interactive_prob: 0.7
37
+
38
+ val_dataset:
39
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
40
+ proto_files:
41
+ - data/protos
42
+ tokenizer: ${tokenizer}
43
+ causal: true
44
+ max_length: ${max_length}
45
+ use_speaker: false
46
+ interactive_prob: 0.7
47
+
48
+ data:
49
+ _target_: fish_speech.datasets.semantic.SemanticDataModule
50
+ train_dataset: ${train_dataset}
51
+ val_dataset: ${val_dataset}
52
+ num_workers: 4
53
+ batch_size: 4
54
+ tokenizer: ${tokenizer}
55
+ max_length: ${max_length}
56
+
57
+ # Model Configuration
58
+ model:
59
+ _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
60
+ model:
61
+ _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
62
+ path: ${pretrained_ckpt_path}
63
+ load_weights: true
64
+ max_length: ${max_length}
65
+ lora_config: null
66
+
67
+ optimizer:
68
+ _target_: torch.optim.AdamW
69
+ _partial_: true
70
+ lr: 1e-4
71
+ weight_decay: 0
72
+ betas: [0.9, 0.95]
73
+ eps: 1e-5
74
+
75
+ lr_scheduler:
76
+ _target_: torch.optim.lr_scheduler.LambdaLR
77
+ _partial_: true
78
+ lr_lambda:
79
+ _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
80
+ _partial_: true
81
+ num_warmup_steps: 10
82
+
83
+ # Callbacks
84
+ callbacks:
85
+ model_checkpoint:
86
+ every_n_train_steps: ${trainer.val_check_interval}
fish_speech/content_sequence.py ADDED
@@ -0,0 +1,367 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List, Literal, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+ from fish_speech.tokenizer import (
8
+ IM_END_TOKEN,
9
+ MODALITY_TOKENS,
10
+ FishTokenizer,
11
+ )
12
+
13
+
14
+ def restore_ndarray(obj, to_tensor: bool = False):
15
+ if isinstance(obj, dict) and "__ndarray__" in obj:
16
+ obj = np.frombuffer(obj["data"], dtype=obj["dtype"]).reshape(obj["shape"])
17
+
18
+ if to_tensor and isinstance(obj, np.ndarray):
19
+ obj = torch.from_numpy(obj.copy())
20
+
21
+ return obj
22
+
23
+
24
+ @dataclass
25
+ class BasePart:
26
+ type: Literal["text", "vq", "audio"] | None = None
27
+ cal_loss: bool = False
28
+
29
+
30
+ @dataclass(kw_only=True)
31
+ class VQPart(BasePart):
32
+ type = "vq"
33
+ codes: torch.Tensor
34
+
35
+ def __post_init__(self: "VQPart"):
36
+ self.type = "vq"
37
+ self.codes = restore_ndarray(self.codes, to_tensor=True)
38
+
39
+
40
+ @dataclass(kw_only=True)
41
+ class TextPart(BasePart):
42
+ type = "text"
43
+ text: str | None = None
44
+ tokens: list[int] | None = None
45
+
46
+ def __post_init__(self: "TextPart"):
47
+ self.type = "text"
48
+ if self.text is None and self.tokens is None:
49
+ raise ValueError("Either text or tokens must be provided")
50
+
51
+
52
+ @dataclass(kw_only=True)
53
+ class AudioPart(BasePart):
54
+ type = "audio"
55
+ features: torch.Tensor
56
+
57
+ def __post_init__(self: "AudioPart"):
58
+ self.type = "audio"
59
+ self.features = restore_ndarray(self.features, to_tensor=True)
60
+
61
+
62
+ @dataclass(kw_only=True)
63
+ class EncodedMessage:
64
+ tokens: torch.Tensor
65
+ labels: torch.Tensor
66
+ vq_mask_tokens: torch.Tensor | None = None
67
+ vq_mask_labels: torch.Tensor | None = None
68
+ vq_parts: list[torch.Tensor]
69
+ vq_require_losses: torch.Tensor | None = None
70
+ audio_parts: list[torch.Tensor]
71
+ audio_masks: torch.Tensor | None = None
72
+ metadata: dict | None = None
73
+
74
+
75
+ @dataclass
76
+ class ContentSequence:
77
+ """
78
+ Flexible sequence of content parts that supports interleaved multimodal format.
79
+ Example format: <|interleave|><|speaker:1|> TEXT AUDIO <|im_end|><|speaker:2|> TEXT AUDIO <|im_end|>
80
+ """
81
+
82
+ parts: list[BasePart] = field(default_factory=list)
83
+ modality: Literal["text", "voice", "interleave"] | None = None
84
+ metadata: dict | None = None
85
+
86
+ def __init__(
87
+ self: "ContentSequence",
88
+ parts: list[BasePart | dict] | None = None,
89
+ modality: Literal["text", "voice", "interleave"] | None = None,
90
+ metadata: dict | None = None,
91
+ ):
92
+ self.modality = modality
93
+ self.metadata = metadata or {}
94
+
95
+ fixed_parts = []
96
+ for part in parts or []:
97
+ if isinstance(part, dict):
98
+ if part["type"] == "vq":
99
+ part = VQPart(**part)
100
+ elif part["type"] == "audio":
101
+ part = AudioPart(**part)
102
+ elif part["type"] == "text":
103
+ part = TextPart(**part)
104
+ else:
105
+ raise ValueError(f"Unsupported part type: {part['type']}")
106
+ fixed_parts.append(part)
107
+
108
+ self.parts = fixed_parts
109
+
110
+ # If modality is specified, add it at the beginning if it's not already there
111
+ if self.modality and not (
112
+ len(self.parts) > 0
113
+ and isinstance(self.parts[0], dict) is False
114
+ and isinstance(self.parts[0], TextPart)
115
+ and self.parts[0].text is not None
116
+ and self.parts[0].text.startswith(MODALITY_TOKENS[self.modality])
117
+ ):
118
+ modality_token = MODALITY_TOKENS[self.modality]
119
+ self.parts.insert(0, TextPart(text=modality_token))
120
+
121
+ def append(
122
+ self: "ContentSequence",
123
+ part_or_parts: Union[BasePart, List[BasePart]],
124
+ add_end: bool = False,
125
+ speaker: Union[str, int] | None = None,
126
+ ):
127
+ """
128
+ Append a part or list of parts to the sequence.
129
+
130
+ Args:
131
+ part_or_parts: A single part or list of parts to add
132
+ add_end: Whether to add the IM_END_TOKEN after these parts
133
+ speaker: Optional speaker identifier (name or ID) to add before the parts
134
+ """
135
+ # Convert single part to list
136
+ parts_to_add = (
137
+ [part_or_parts] if not isinstance(part_or_parts, list) else part_or_parts
138
+ )
139
+
140
+ # Add speaker token if specified
141
+ if speaker is not None:
142
+ speaker_token = f"<|speaker:{speaker}|>"
143
+ self.parts.append(TextPart(text=speaker_token))
144
+
145
+ # Add all the parts
146
+ self.parts.extend(parts_to_add)
147
+
148
+ # Add end token if requested
149
+ if add_end:
150
+ self.parts.append(
151
+ TextPart(text=IM_END_TOKEN, cal_loss=self.parts[-1].cal_loss)
152
+ )
153
+
154
+ def encode(
155
+ self: "ContentSequence",
156
+ tokenizer: FishTokenizer,
157
+ add_shift: bool = True,
158
+ ignore_loss_tokens: list[str] = [],
159
+ ) -> EncodedMessage:
160
+ """
161
+ Encode the sequence parts into tokens for the model.
162
+
163
+ Args:
164
+ tokenizer: The tokenizer to use
165
+ add_shift: Whether to shift tokens for next-token prediction
166
+ ignore_loss_tokens: List of token strings to ignore when calculating loss
167
+
168
+ Returns:
169
+ EncodedMessage with tensors ready for the model
170
+ """
171
+ all_tokens = []
172
+ all_labels = []
173
+
174
+ # Multi-modal elements
175
+ vq_parts = []
176
+ vq_masks = []
177
+ vq_require_losses = []
178
+
179
+ audio_parts = []
180
+ audio_masks = []
181
+
182
+ ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
183
+
184
+ for part in self.parts:
185
+ if isinstance(part, TextPart):
186
+ if part.tokens is None:
187
+ assert part.text is not None
188
+ tokens = tokenizer.encode(part.text)
189
+ else:
190
+ tokens = part.tokens
191
+
192
+ tokens = torch.tensor(tokens, dtype=torch.int)
193
+ elif isinstance(part, VQPart):
194
+ curr_codes = part.codes.clone().to(torch.int)
195
+ tokens = torch.tensor(
196
+ [
197
+ tokenizer.semantic_id_to_token_id[int(i.item())]
198
+ for i in curr_codes[0].int()
199
+ ],
200
+ dtype=torch.int,
201
+ )
202
+ vq_parts.append(curr_codes)
203
+ vq_require_losses.append(part.cal_loss)
204
+ else:
205
+ raise ValueError(f"Unsupported part type: {type(part)}")
206
+
207
+ all_tokens.append(tokens)
208
+
209
+ # Set masks for different part types
210
+ if isinstance(part, VQPart):
211
+ vq_masks.append(torch.ones_like(tokens, dtype=torch.bool))
212
+ audio_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
213
+ elif isinstance(part, AudioPart):
214
+ vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
215
+ audio_mask = torch.ones_like(tokens, dtype=torch.bool)
216
+ audio_mask[0] = False # Skip start token
217
+ audio_mask[-1] = False # Skip end token
218
+ audio_masks.append(audio_mask)
219
+ else:
220
+ vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
221
+ audio_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
222
+
223
+ # Set labels based on whether we want to calculate loss for this part
224
+ if part.cal_loss and not isinstance(part, AudioPart):
225
+ all_labels.append(tokens.clone())
226
+ else:
227
+ all_labels.append(torch.full_like(tokens, -100))
228
+
229
+ # Concatenate all tensors
230
+ tokens = torch.cat(all_tokens, dim=0)
231
+ labels = torch.cat(all_labels, dim=0)
232
+ vq_masks = torch.cat(vq_masks, dim=0)
233
+ audio_masks = torch.cat(audio_masks, dim=0)
234
+ vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
235
+
236
+ # Apply shift if needed for next-token prediction
237
+ vq_mask_tokens = vq_masks
238
+ vq_mask_labels = vq_masks
239
+
240
+ if add_shift:
241
+ tokens = tokens[:-1]
242
+ labels = labels[1:]
243
+ vq_masks = vq_masks[:-1]
244
+ vq_mask_tokens = vq_mask_tokens[:-1]
245
+ vq_mask_labels = vq_mask_labels[1:]
246
+ audio_masks = audio_masks[:-1]
247
+
248
+ # Ignore specified tokens
249
+ for i in ignore_loss_token_ids:
250
+ assert i != -100 and i is not None
251
+ labels[labels == i] = -100
252
+
253
+ assert tokens.dtype in [
254
+ torch.int,
255
+ torch.long,
256
+ ], f"Invalid dtype: {tokens.dtype}"
257
+
258
+ return EncodedMessage(
259
+ tokens=tokens,
260
+ labels=labels,
261
+ vq_parts=vq_parts,
262
+ vq_mask_tokens=vq_mask_tokens,
263
+ vq_mask_labels=vq_mask_labels,
264
+ vq_require_losses=vq_require_losses,
265
+ audio_parts=audio_parts,
266
+ audio_masks=audio_masks,
267
+ metadata=self.metadata,
268
+ )
269
+
270
+ def encode_for_inference(
271
+ self: "ContentSequence",
272
+ tokenizer: FishTokenizer,
273
+ num_codebooks: int,
274
+ ) -> torch.Tensor:
275
+ encoded = self.encode(tokenizer, add_shift=False)
276
+ tokens = encoded.tokens
277
+ values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
278
+ values[0] = tokens
279
+
280
+ if (encoded.vq_parts is None or len(encoded.vq_parts) == 0) and (
281
+ encoded.audio_parts is None or len(encoded.audio_parts) == 0
282
+ ):
283
+ return values
284
+
285
+ if encoded.vq_parts is not None and len(encoded.vq_parts) > 0:
286
+ vq_parts = encoded.vq_parts
287
+ vq_parts = torch.cat(vq_parts, dim=1)
288
+ values[0, encoded.vq_mask_tokens] = (
289
+ vq_parts[0] + tokenizer.semantic_begin_id
290
+ )
291
+ values[1:, encoded.vq_mask_tokens] = vq_parts
292
+
293
+ return values
294
+
295
+ def visualize(
296
+ self: "ContentSequence",
297
+ tokenizer: FishTokenizer,
298
+ ignore_loss_tokens: list[str] = [],
299
+ merge_semantic_tokens: bool = False,
300
+ ):
301
+ """
302
+ Visualize the encoded sequence with color-coded tokens.
303
+ Blue/cyan tokens contribute to loss, green tokens do not.
304
+ """
305
+ encoded = self.encode(
306
+ tokenizer, add_shift=False, ignore_loss_tokens=ignore_loss_tokens
307
+ )
308
+
309
+ # Colors for alternating tokens
310
+ colors = {
311
+ "blue": "\033[94m", # Light blue
312
+ "cyan": "\033[96m", # Cyan
313
+ "green": "\033[92m", # Light green
314
+ "dark_green": "\033[32m", # Dark green
315
+ }
316
+ blue_idx = 0
317
+ green_idx = 0
318
+
319
+ def print_in_blue(x):
320
+ nonlocal blue_idx
321
+ color = colors["blue"] if blue_idx % 2 == 0 else colors["cyan"]
322
+ print(f"{color}{x}\033[0m", end="")
323
+ blue_idx += 1
324
+
325
+ def print_in_green(x):
326
+ nonlocal green_idx
327
+ color = colors["green"] if green_idx % 2 == 0 else colors["dark_green"]
328
+ print(f"{color}{x}\033[0m", end="")
329
+ green_idx += 1
330
+
331
+ def print_semantic_token(x, count):
332
+ val = f"[<|semantic|>x{count}]"
333
+ if x == -100:
334
+ print_in_green(val)
335
+ else:
336
+ print_in_blue(val)
337
+
338
+ count_semantic_tokens = 0
339
+ semantic_label = None
340
+
341
+ for tok, lab in zip(encoded.tokens, encoded.labels):
342
+ token_id = int(tok.item())
343
+
344
+ if merge_semantic_tokens:
345
+ if (
346
+ tokenizer.semantic_begin_id <= token_id <= tokenizer.semantic_end_id
347
+ and (semantic_label is None or semantic_label == lab)
348
+ ):
349
+ count_semantic_tokens += 1
350
+ semantic_label = lab
351
+ continue
352
+ elif count_semantic_tokens > 0:
353
+ print_semantic_token(semantic_label, count_semantic_tokens)
354
+ count_semantic_tokens = 0
355
+ semantic_label = None
356
+
357
+ val = tokenizer.decode([int(tok.item())])
358
+
359
+ if lab == -100:
360
+ print_in_green(val)
361
+ else:
362
+ print_in_blue(val)
363
+
364
+ if merge_semantic_tokens and count_semantic_tokens > 0:
365
+ print_semantic_token(semantic_label, count_semantic_tokens)
366
+
367
+ print()
fish_speech/i18n/README.md CHANGED
@@ -1,27 +1,27 @@
1
- ## i18n Folder Attribution
2
-
3
- The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
4
-
5
- ### fish_speech/i18n/core.py
6
-
7
- **Related code from RVC:**
8
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
9
-
10
- **Initial commit:**
11
- add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
12
-
13
- **Initial author:**
14
- [@L4Ph](https://github.com/L4Ph)
15
-
16
- ### fish_speech/i18n/scan.py
17
-
18
- **Related code from RVC:**
19
- [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
20
-
21
- **Initial commit:**
22
- File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
23
-
24
- **Initial author:**
25
- [@towzeur](https://github.com/towzeur)
26
-
27
- We appreciate the contributions of the RVC project and its authors.
 
1
+ ## i18n Folder Attribution
2
+
3
+ The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
4
+
5
+ ### fish_speech/i18n/core.py
6
+
7
+ **Related code from RVC:**
8
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
9
+
10
+ **Initial commit:**
11
+ add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
12
+
13
+ **Initial author:**
14
+ [@L4Ph](https://github.com/L4Ph)
15
+
16
+ ### fish_speech/i18n/scan.py
17
+
18
+ **Related code from RVC:**
19
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
20
+
21
+ **Initial commit:**
22
+ File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
23
+
24
+ **Initial author:**
25
+ [@towzeur](https://github.com/towzeur)
26
+
27
+ We appreciate the contributions of the RVC project and its authors.
fish_speech/i18n/__init__.py CHANGED
@@ -1,3 +1,3 @@
-from .core import i18n
-
-__all__ = ["i18n"]
+from .core import i18n
+
+__all__ = ["i18n"]
fish_speech/i18n/core.py CHANGED
@@ -1,40 +1,40 @@
1
- import json
2
- import locale
3
- from pathlib import Path
4
-
5
- I18N_FILE_PATH = Path(__file__).parent / "locale"
6
- DEFAULT_LANGUAGE = "en_US"
7
-
8
-
9
- def load_language_list(language):
10
- with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
11
- language_list = json.load(f)
12
-
13
- return language_list
14
-
15
-
16
- class I18nAuto:
17
- def __init__(self):
18
- i18n_file = Path(".locale")
19
-
20
- if i18n_file.exists():
21
- with open(i18n_file, "r", encoding="utf-8") as f:
22
- language = f.read().strip()
23
- else:
24
- # getlocale can't identify the system's language ((None, None))
25
- language = locale.getdefaultlocale()[0]
26
-
27
- if (I18N_FILE_PATH / f"{language}.json").exists() is False:
28
- language = DEFAULT_LANGUAGE
29
-
30
- self.language = language
31
- self.language_map = load_language_list(language)
32
-
33
- def __call__(self, key):
34
- return self.language_map.get(key, key)
35
-
36
- def __repr__(self):
37
- return "Use Language: " + self.language
38
-
39
-
40
- i18n = I18nAuto()
 
1
+ import json
2
+ import locale
3
+ from pathlib import Path
4
+
5
+ I18N_FILE_PATH = Path(__file__).parent / "locale"
6
+ DEFAULT_LANGUAGE = "en_US"
7
+
8
+
9
+ def load_language_list(language):
10
+ with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
11
+ language_list = json.load(f)
12
+
13
+ return language_list
14
+
15
+
16
+ class I18nAuto:
17
+ def __init__(self):
18
+ i18n_file = Path(".locale")
19
+
20
+ if i18n_file.exists():
21
+ with open(i18n_file, "r", encoding="utf-8") as f:
22
+ language = f.read().strip()
23
+ else:
24
+ # getlocale can't identify the system's language ((None, None))
25
+ language = locale.getdefaultlocale()[0]
26
+
27
+ if (I18N_FILE_PATH / f"{language}.json").exists() is False:
28
+ language = DEFAULT_LANGUAGE
29
+
30
+ self.language = language
31
+ self.language_map = load_language_list(language)
32
+
33
+ def __call__(self, key):
34
+ return self.language_map.get(key, key)
35
+
36
+ def __repr__(self):
37
+ return "Use Language: " + self.language
38
+
39
+
40
+ i18n = I18nAuto()
fish_speech/i18n/locale/en_US.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
- "Add to Processing Area": "Add to Processing Area",
7
- "Added path successfully!": "Added path successfully!",
8
- "Advanced Config": "Advanced Config",
9
- "Base LLAMA Model": "Base LLAMA Model",
10
- "Batch Inference": "Batch Inference",
11
- "Batch Size": "Batch Size",
12
- "Changing with the Model Path": "Changing with the Model Path",
13
- "Chinese": "Chinese",
14
- "Compile Model": "Compile Model",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
- "Copy": "Copy",
17
- "Data Preprocessing": "Data Preprocessing",
18
- "Data Preprocessing Path": "Data Preprocessing Path",
19
- "Data Source": "Data Source",
20
- "Decoder Model Config": "Decoder Model Config",
21
- "Decoder Model Path": "Decoder Model Path",
22
- "Disabled": "Disabled",
23
- "Enable Reference Audio": "Enable Reference Audio",
24
- "English": "English",
25
- "Error Message": "Error Message",
26
- "File Preprocessing": "File Preprocessing",
27
- "Generate": "Generate",
28
- "Generated Audio": "Generated Audio",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
- "Infer interface is closed": "Infer interface is closed",
31
- "Inference Configuration": "Inference Configuration",
32
- "Inference Server Configuration": "Inference Server Configuration",
33
- "Inference Server Error": "Inference Server Error",
34
- "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
- "Initial Learning Rate": "Initial Learning Rate",
36
- "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
- "Input Text": "Input Text",
38
- "Invalid path: {}": "Invalid path: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
- "Japanese": "Japanese",
42
- "LLAMA Configuration": "LLAMA Configuration",
43
- "LLAMA Model Config": "LLAMA Model Config",
44
- "LLAMA Model Path": "LLAMA Model Path",
45
- "Labeling Device": "Labeling Device",
46
- "LoRA Model to be merged": "LoRA Model to be merged",
47
- "Maximum Audio Duration": "Maximum Audio Duration",
48
- "Maximum Length per Sample": "Maximum Length per Sample",
49
- "Maximum Training Steps": "Maximum Training Steps",
50
- "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
- "Merge": "Merge",
52
- "Merge LoRA": "Merge LoRA",
53
- "Merge successfully": "Merge successfully",
54
- "Minimum Audio Duration": "Minimum Audio Duration",
55
- "Model Output Path": "Model Output Path",
56
- "Model Size": "Model Size",
57
- "Move": "Move",
58
- "Move files successfully": "Move files successfully",
59
- "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
- "No selected options": "No selected options",
61
- "Number of Workers": "Number of Workers",
62
- "Open Inference Server": "Open Inference Server",
63
- "Open Labeler WebUI": "Open Labeler WebUI",
64
- "Open Tensorboard": "Open Tensorboard",
65
- "Opened labeler in browser": "Opened labeler in browser",
66
- "Optional Label Language": "Optional Label Language",
67
- "Optional online ver": "Optional online ver",
68
- "Output Path": "Output Path",
69
- "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
- "Precision": "Precision",
71
- "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
- "Put your text here.": "Put your text here.",
73
- "Reference Audio": "Reference Audio",
74
- "Reference Text": "Reference Text",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
76
- "Remove Selected Data": "Remove Selected Data",
77
- "Removed path successfully!": "Removed path successfully!",
78
- "Repetition Penalty": "Repetition Penalty",
79
- "Save model every n steps": "Save model every n steps",
80
- "Select LLAMA ckpt": "Select LLAMA ckpt",
81
- "Select VITS ckpt": "Select VITS ckpt",
82
- "Select VQGAN ckpt": "Select VQGAN ckpt",
83
- "Select source file processing method": "Select source file processing method",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
- "Selected: {}": "Selected: {}",
86
- "Speaker": "Speaker",
87
- "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
- "Start Training": "Start Training",
89
- "Streaming Audio": "Streaming Audio",
90
- "Streaming Generate": "Streaming Generate",
91
- "Tensorboard Host": "Tensorboard Host",
92
- "Tensorboard Log Path": "Tensorboard Log Path",
93
- "Tensorboard Port": "Tensorboard Port",
94
- "Tensorboard interface is closed": "Tensorboard interface is closed",
95
- "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
- "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
- "Training Configuration": "Training Configuration",
99
- "Training Error": "Training Error",
100
- "Training stopped": "Training stopped",
101
- "Type name of the speaker": "Type name of the speaker",
102
- "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
- "Use LoRA": "Use LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
- "Use filelist": "Use filelist",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
- "VITS Configuration": "VITS Configuration",
108
- "VQGAN Configuration": "VQGAN Configuration",
109
- "Validation Batch Size": "Validation Batch Size",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
- "WebUI Host": "WebUI Host",
113
- "WebUI Port": "WebUI Port",
114
- "Whisper Model": "Whisper Model",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
- "latest": "latest",
118
- "new": "new",
119
- "Realtime Transform Text": "Realtime Transform Text",
120
- "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
- "Text Normalization": "Text Normalization",
122
- "Select Example Audio": "Select Example Audio"
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
+ "Add to Processing Area": "Add to Processing Area",
7
+ "Added path successfully!": "Added path successfully!",
8
+ "Advanced Config": "Advanced Config",
9
+ "Base LLAMA Model": "Base LLAMA Model",
10
+ "Batch Inference": "Batch Inference",
11
+ "Batch Size": "Batch Size",
12
+ "Changing with the Model Path": "Changing with the Model Path",
13
+ "Chinese": "Chinese",
14
+ "Compile Model": "Compile Model",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
+ "Copy": "Copy",
17
+ "Data Preprocessing": "Data Preprocessing",
18
+ "Data Preprocessing Path": "Data Preprocessing Path",
19
+ "Data Source": "Data Source",
20
+ "Decoder Model Config": "Decoder Model Config",
21
+ "Decoder Model Path": "Decoder Model Path",
22
+ "Disabled": "Disabled",
23
+ "Enable Reference Audio": "Enable Reference Audio",
24
+ "English": "English",
25
+ "Error Message": "Error Message",
26
+ "File Preprocessing": "File Preprocessing",
27
+ "Generate": "Generate",
28
+ "Generated Audio": "Generated Audio",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
+ "Infer interface is closed": "Infer interface is closed",
31
+ "Inference Configuration": "Inference Configuration",
32
+ "Inference Server Configuration": "Inference Server Configuration",
33
+ "Inference Server Error": "Inference Server Error",
34
+ "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
+ "Initial Learning Rate": "Initial Learning Rate",
36
+ "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
+ "Input Text": "Input Text",
38
+ "Invalid path: {}": "Invalid path: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
+ "Japanese": "Japanese",
42
+ "LLAMA Configuration": "LLAMA Configuration",
43
+ "LLAMA Model Config": "LLAMA Model Config",
44
+ "LLAMA Model Path": "LLAMA Model Path",
45
+ "Labeling Device": "Labeling Device",
46
+ "LoRA Model to be merged": "LoRA Model to be merged",
47
+ "Maximum Audio Duration": "Maximum Audio Duration",
48
+ "Maximum Length per Sample": "Maximum Length per Sample",
49
+ "Maximum Training Steps": "Maximum Training Steps",
50
+ "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
+ "Merge": "Merge",
52
+ "Merge LoRA": "Merge LoRA",
53
+ "Merge successfully": "Merge successfully",
54
+ "Minimum Audio Duration": "Minimum Audio Duration",
55
+ "Model Output Path": "Model Output Path",
56
+ "Model Size": "Model Size",
57
+ "Move": "Move",
58
+ "Move files successfully": "Move files successfully",
59
+ "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
+ "No selected options": "No selected options",
61
+ "Number of Workers": "Number of Workers",
62
+ "Open Inference Server": "Open Inference Server",
63
+ "Open Labeler WebUI": "Open Labeler WebUI",
64
+ "Open Tensorboard": "Open Tensorboard",
65
+ "Opened labeler in browser": "Opened labeler in browser",
66
+ "Optional Label Language": "Optional Label Language",
67
+ "Optional online ver": "Optional online ver",
68
+ "Output Path": "Output Path",
69
+ "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
+ "Precision": "Precision",
71
+ "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
+ "Put your text here.": "Put your text here.",
73
+ "Reference Audio": "Reference Audio",
74
+ "Reference Text": "Reference Text",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
76
+ "Remove Selected Data": "Remove Selected Data",
77
+ "Removed path successfully!": "Removed path successfully!",
78
+ "Repetition Penalty": "Repetition Penalty",
79
+ "Save model every n steps": "Save model every n steps",
80
+ "Select LLAMA ckpt": "Select LLAMA ckpt",
81
+ "Select VITS ckpt": "Select VITS ckpt",
82
+ "Select VQGAN ckpt": "Select VQGAN ckpt",
83
+ "Select source file processing method": "Select source file processing method",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
+ "Selected: {}": "Selected: {}",
86
+ "Speaker": "Speaker",
87
+ "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
+ "Start Training": "Start Training",
89
+ "Streaming Audio": "Streaming Audio",
90
+ "Streaming Generate": "Streaming Generate",
91
+ "Tensorboard Host": "Tensorboard Host",
92
+ "Tensorboard Log Path": "Tensorboard Log Path",
93
+ "Tensorboard Port": "Tensorboard Port",
94
+ "Tensorboard interface is closed": "Tensorboard interface is closed",
95
+ "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
+ "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
+ "Training Configuration": "Training Configuration",
99
+ "Training Error": "Training Error",
100
+ "Training stopped": "Training stopped",
101
+ "Type name of the speaker": "Type name of the speaker",
102
+ "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
+ "Use LoRA": "Use LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
+ "Use filelist": "Use filelist",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
+ "VITS Configuration": "VITS Configuration",
108
+ "VQGAN Configuration": "VQGAN Configuration",
109
+ "Validation Batch Size": "Validation Batch Size",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
+ "WebUI Host": "WebUI Host",
113
+ "WebUI Port": "WebUI Port",
114
+ "Whisper Model": "Whisper Model",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
+ "latest": "latest",
118
+ "new": "new",
119
+ "Realtime Transform Text": "Realtime Transform Text",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
+ "Text Normalization": "Text Normalization",
122
+ "Select Example Audio": "Select Example Audio"
123
+ }
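Each locale file maps the English UI strings (used verbatim as lookup keys in the WebUI code) to their translations; because `I18nAuto.__call__` falls back to the key, a missing entry simply renders in English. A hypothetical sketch of registering an extra locale under the same layout:

```python
# Hypothetical example: fr_FR and the strings below are made up for
# illustration; only the file layout and lookup behavior mirror this commit.
import json
from pathlib import Path

locale_dir = Path("fish_speech/i18n/locale")
fr_FR = {
    "Generate": "Générer",
    "Input Text": "Texte d'entrée",
    # ...remaining keys; omitted keys fall back to the English key text.
}
(locale_dir / "fr_FR.json").write_text(
    json.dumps(fr_FR, ensure_ascii=False, indent=2), encoding="utf-8"
)
# Select it for the next run (read by I18nAuto at import time).
Path(".locale").write_text("fr_FR", encoding="utf-8")
```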
fish_speech/i18n/locale/es_ES.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
- "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
- "Add to Processing Area": "Agregar al Área de Procesamiento",
7
- "Added path successfully!": "¡Ruta agregada exitosamente!",
8
- "Advanced Config": "Configuración Avanzada",
9
- "Base LLAMA Model": "Modelo Base LLAMA",
10
- "Batch Inference": "Inferencia por Lote",
11
- "Batch Size": "Tamaño del Lote",
12
- "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
- "Chinese": "Chino",
14
- "Compile Model": "Compilar Modelo",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
- "Copy": "Copiar",
17
- "Data Preprocessing": "Preprocesamiento de Datos",
18
- "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
- "Data Source": "Fuente de Datos",
20
- "Decoder Model Config": "Configuración del modelo decodificador",
21
- "Decoder Model Path": "Ruta del modelo decodificador",
22
- "Disabled": "Desactivado",
23
- "Enable Reference Audio": "Habilitar Audio de Referencia",
24
- "English": "Inglés",
25
- "Error Message": "Mensaje de Error",
26
- "File Preprocessing": "Preprocesamiento de Archivos",
27
- "Generate": "Generar",
28
- "Generated Audio": "Audio Generado",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
- "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
- "Inference Configuration": "Configuración de Inferencia",
32
- "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
- "Inference Server Error": "Error del Servidor de Inferencia",
34
- "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
- "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
- "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
- "Input Text": "Texto de Entrada",
38
- "Invalid path: {}": "Ruta inválida: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
- "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
- "Japanese": "Japonés",
42
- "LLAMA Configuration": "Configuración de LLAMA",
43
- "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
- "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
- "Labeling Device": "Dispositivo de Etiquetado",
46
- "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
- "Maximum Audio Duration": "Duración máxima de audio",
48
- "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
- "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
- "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
- "Merge": "Fusionar",
52
- "Merge LoRA": "Fusionar LoRA",
53
- "Merge successfully": "Fusionado exitosamente",
54
- "Minimum Audio Duration": "Duración mínima de audio",
55
- "Model Output Path": "Ruta de Salida del Modelo",
56
- "Model Size": "Tamaño del Modelo",
57
- "Move": "Mover",
58
- "Move files successfully": "Archivos movidos exitosamente",
59
- "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
- "No selected options": "No hay opciones seleccionadas",
61
- "Number of Workers": "Número de Trabajadores",
62
- "Open Inference Server": "Abrir Servidor de Inferencia",
63
- "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
- "Open Tensorboard": "Abrir Tensorboard",
65
- "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
- "Optional Label Language": "Idioma de Etiquetado Opcional",
67
- "Optional online ver": "Ver en línea opcional",
68
- "Output Path": "Ruta de Salida",
69
- "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
- "Precision": "Precisión",
71
- "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
- "Put your text here.": "Ponga su texto aquí.",
73
- "Reference Audio": "Audio de Referencia",
74
- "Reference Text": "Texto de Referencia",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
- "Remove Selected Data": "Eliminar Datos Seleccionados",
77
- "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
- "Repetition Penalty": "Penalización por Repetición",
79
- "Save model every n steps": "Guardar modelo cada n pasos",
80
- "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
- "Select VITS ckpt": "Seleccionar punto de control VITS",
82
- "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
- "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
- "Selected: {}": "Seleccionado: {}",
86
- "Speaker": "Hablante",
87
- "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
- "Start Training": "Iniciar Entrenamiento",
89
- "Streaming Audio": "transmisión de audio",
90
- "Streaming Generate": "síntesis en flujo",
91
- "Tensorboard Host": "Host de Tensorboard",
92
- "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
- "Tensorboard Port": "Puerto de Tensorboard",
94
- "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
- "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
- "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
- "Training Configuration": "Configuración de Entrenamiento",
99
- "Training Error": "Error de Entrenamiento",
100
- "Training stopped": "Entrenamiento detenido",
101
- "Type name of the speaker": "Escriba el nombre del hablante",
102
- "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
- "Use LoRA": "Usar LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
- "Use filelist": "Usar lista de archivos",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
- "VITS Configuration": "Configuración de VITS",
108
- "VQGAN Configuration": "Configuración de VQGAN",
109
- "Validation Batch Size": "Tamaño del Lote de Validación",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
- "WebUI Host": "Host de WebUI",
113
- "WebUI Port": "Puerto de WebUI",
114
- "Whisper Model": "Modelo Whisper",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
- "latest": "más reciente",
118
- "new": "nuevo",
119
- "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
- "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
- "Text Normalization": "Normalización de Texto",
122
- "Select Example Audio": "Selecionar áudio de exemplo"
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
+ "Add to Processing Area": "Agregar al Área de Procesamiento",
7
+ "Added path successfully!": "¡Ruta agregada exitosamente!",
8
+ "Advanced Config": "Configuración Avanzada",
9
+ "Base LLAMA Model": "Modelo Base LLAMA",
10
+ "Batch Inference": "Inferencia por Lote",
11
+ "Batch Size": "Tamaño del Lote",
12
+ "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
+ "Chinese": "Chino",
14
+ "Compile Model": "Compilar Modelo",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
+ "Copy": "Copiar",
17
+ "Data Preprocessing": "Preprocesamiento de Datos",
18
+ "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
+ "Data Source": "Fuente de Datos",
20
+ "Decoder Model Config": "Configuración del modelo decodificador",
21
+ "Decoder Model Path": "Ruta del modelo decodificador",
22
+ "Disabled": "Desactivado",
23
+ "Enable Reference Audio": "Habilitar Audio de Referencia",
24
+ "English": "Inglés",
25
+ "Error Message": "Mensaje de Error",
26
+ "File Preprocessing": "Preprocesamiento de Archivos",
27
+ "Generate": "Generar",
28
+ "Generated Audio": "Audio Generado",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
+ "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
+ "Inference Configuration": "Configuración de Inferencia",
32
+ "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
+ "Inference Server Error": "Error del Servidor de Inferencia",
34
+ "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
+ "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
+ "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
+ "Input Text": "Texto de Entrada",
38
+ "Invalid path: {}": "Ruta inválida: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
+ "Japanese": "Japonés",
42
+ "LLAMA Configuration": "Configuración de LLAMA",
43
+ "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
+ "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
+ "Labeling Device": "Dispositivo de Etiquetado",
46
+ "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
+ "Maximum Audio Duration": "Duración máxima de audio",
48
+ "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
+ "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
+ "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
+ "Merge": "Fusionar",
52
+ "Merge LoRA": "Fusionar LoRA",
53
+ "Merge successfully": "Fusionado exitosamente",
54
+ "Minimum Audio Duration": "Duración mínima de audio",
55
+ "Model Output Path": "Ruta de Salida del Modelo",
56
+ "Model Size": "Tamaño del Modelo",
57
+ "Move": "Mover",
58
+ "Move files successfully": "Archivos movidos exitosamente",
59
+ "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
+ "No selected options": "No hay opciones seleccionadas",
61
+ "Number of Workers": "Número de Trabajadores",
62
+ "Open Inference Server": "Abrir Servidor de Inferencia",
63
+ "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
+ "Open Tensorboard": "Abrir Tensorboard",
65
+ "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
+ "Optional Label Language": "Idioma de Etiquetado Opcional",
67
+ "Optional online ver": "Ver en línea opcional",
68
+ "Output Path": "Ruta de Salida",
69
+ "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
+ "Precision": "Precisión",
71
+ "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
+ "Put your text here.": "Ponga su texto aquí.",
73
+ "Reference Audio": "Audio de Referencia",
74
+ "Reference Text": "Texto de Referencia",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
+ "Remove Selected Data": "Eliminar Datos Seleccionados",
77
+ "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
+ "Repetition Penalty": "Penalización por Repetición",
79
+ "Save model every n steps": "Guardar modelo cada n pasos",
80
+ "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
+ "Select VITS ckpt": "Seleccionar punto de control VITS",
82
+ "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
+ "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
+ "Selected: {}": "Seleccionado: {}",
86
+ "Speaker": "Hablante",
87
+ "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
+ "Start Training": "Iniciar Entrenamiento",
89
+ "Streaming Audio": "transmisión de audio",
90
+ "Streaming Generate": "síntesis en flujo",
91
+ "Tensorboard Host": "Host de Tensorboard",
92
+ "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
+ "Tensorboard Port": "Puerto de Tensorboard",
94
+ "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
+ "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
+ "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
+ "Training Configuration": "Configuración de Entrenamiento",
99
+ "Training Error": "Error de Entrenamiento",
100
+ "Training stopped": "Entrenamiento detenido",
101
+ "Type name of the speaker": "Escriba el nombre del hablante",
102
+ "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
+ "Use LoRA": "Usar LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
+ "Use filelist": "Usar lista de archivos",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
+ "VITS Configuration": "Configuración de VITS",
108
+ "VQGAN Configuration": "Configuración de VQGAN",
109
+ "Validation Batch Size": "Tamaño del Lote de Validación",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
+ "WebUI Host": "Host de WebUI",
113
+ "WebUI Port": "Puerto de WebUI",
114
+ "Whisper Model": "Modelo Whisper",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
+ "latest": "más reciente",
118
+ "new": "nuevo",
119
+ "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
+ "Text Normalization": "Normalización de Texto",
122
+ "Select Example Audio": "Selecionar áudio de exemplo"
123
+ }
fish_speech/i18n/locale/ja_JP.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
5
- "Accumulate Gradient Batches": "勾配バッチの累積",
6
- "Add to Processing Area": "処理エリアに追加",
7
- "Added path successfully!": "パスの追加に成功しました!",
8
- "Advanced Config": "詳細設定",
9
- "Base LLAMA Model": "基本LLAMAモデル",
10
- "Batch Inference": "バッチ推論",
11
- "Batch Size": "バッチサイズ",
12
- "Changing with the Model Path": "モデルのパスに伴って変化する",
13
- "Chinese": "中国語",
14
- "Compile Model": "モデルのコンパイル",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
16
- "Copy": "コピー",
17
- "Data Preprocessing": "データ前処理",
18
- "Data Preprocessing Path": "データ前処理パス",
19
- "Data Source": "データソース",
20
- "Decoder Model Config": "デコーダーモデルの構成",
21
- "Decoder Model Path": "デコーダーモデルのパス",
22
- "Disabled": "無効",
23
- "Enable Reference Audio": "リファレンスオーディオを有効にする",
24
- "English": "英語",
25
- "Error Message": "エラーメッセージ",
26
- "File Preprocessing": "文書前处理",
27
- "Generate": "生成",
28
- "Generated Audio": "生成されたオーディオ",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
30
- "Infer interface is closed": "推論インターフェースが閉じられています",
31
- "Inference Configuration": "推論設定",
32
- "Inference Server Configuration": "推論サーバー設定",
33
- "Inference Server Error": "推論サーバーエラー",
34
- "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
35
- "Initial Learning Rate": "初期学習率",
36
- "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
37
- "Input Text": "入力テキスト",
38
- "Invalid path: {}": "無効なパス: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
40
- "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
41
- "Japanese": "日本語",
42
- "LLAMA Configuration": "LLAMA設定",
43
- "LLAMA Model Config": "LLAMAモデル設定",
44
- "LLAMA Model Path": "LLAMAモデルパス",
45
- "Labeling Device": "ラベリングデバイス",
46
- "LoRA Model to be merged": "マージするLoRAモデル",
47
- "Maximum Audio Duration": "最大オーディオの長さ",
48
- "Maximum Length per Sample": "サンプルあたりの最大長",
49
- "Maximum Training Steps": "最大トレーニングステップ数",
50
- "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
51
- "Merge": "マージ",
52
- "Merge LoRA": "LoRAのマージ",
53
- "Merge successfully": "マージに成功しました",
54
- "Minimum Audio Duration": "最小オーディオの長さ",
55
- "Model Output Path": "モデル出力パス",
56
- "Model Size": "モデルサイズ",
57
- "Move": "移動",
58
- "Move files successfully": "ファイルの移動に成功しました",
59
- "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
60
- "No selected options": "選択されたオプションはありません",
61
- "Number of Workers": "ワーカー数",
62
- "Open Inference Server": "推論サーバーを開く",
63
- "Open Labeler WebUI": "ラベラーWebUIを開く",
64
- "Open Tensorboard": "Tensorboardを開く",
65
- "Opened labeler in browser": "ブラウザでラベラーを開きました",
66
- "Optional Label Language": "オプションのラベル言語",
67
- "Optional online ver": "オプションのオンラインバージョン",
68
- "Output Path": "出力パス",
69
- "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "話者条件を適用する確率",
72
- "Put your text here.": "ここにテキスト���入力してください。",
73
- "Reference Audio": "リファレンスオーディオ",
74
- "Reference Text": "リファレンステキスト",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
76
- "Remove Selected Data": "選択したデータを削除",
77
- "Removed path successfully!": "パスの削除に成功しました!",
78
- "Repetition Penalty": "反復ペナルティ",
79
- "Save model every n steps": "nステップごとにモデルを保存",
80
- "Select LLAMA ckpt": " LLAMA チェックポイントを選択",
81
- "Select VITS ckpt": "VITS チェックポイントを選択",
82
- "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
83
- "Select source file processing method": "ソースファイルの処理方法を選択",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
85
- "Selected: {}": "選択済み: {}",
86
- "Speaker": "話者",
87
- "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
88
- "Start Training": "トレーニング開始",
89
- "Streaming Audio": "ストリーミングオーディオ",
90
- "Streaming Generate": "ストリーミング合成",
91
- "Tensorboard Host": "Tensorboardホスト",
92
- "Tensorboard Log Path": "Tensorboardログパス",
93
- "Tensorboard Port": "Tensorboardポート",
94
- "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
95
- "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
96
- "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
98
- "Training Configuration": "トレーニング設定",
99
- "Training Error": "トレーニングエラー",
100
- "Training stopped": "トレーニングが停止しました",
101
- "Type name of the speaker": "話者の名前を入力",
102
- "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
103
- "Use LoRA": "LoRAを使用",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
105
- "Use filelist": "ファイルリストを使用",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
107
- "VITS Configuration": "VITS の構成",
108
- "VQGAN Configuration": "VQGAN の構成",
109
- "Validation Batch Size": "検証バッチサイズ",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
112
- "WebUI Host": "WebUIホスト",
113
- "WebUI Port": "WebUIポート",
114
- "Whisper Model": "Whisperモデル",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
117
- "latest": "最新",
118
- "new": "新規",
119
- "Realtime Transform Text": "リアルタイム変換テキスト",
120
- "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
121
- "Text Normalization": "テキスト正規化",
122
- "Select Example Audio": "サンプル音声を選択"
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成��デル。",
5
+ "Accumulate Gradient Batches": "勾配バッチの累積",
6
+ "Add to Processing Area": "処理エリアに追加",
7
+ "Added path successfully!": "パスの追加に成功しました!",
8
+ "Advanced Config": "詳細設定",
9
+ "Base LLAMA Model": "基本LLAMAモデル",
10
+ "Batch Inference": "バッチ推論",
11
+ "Batch Size": "バッチサイズ",
12
+ "Changing with the Model Path": "モデルのパスに伴って変化する",
13
+ "Chinese": "中国語",
14
+ "Compile Model": "モデルのコンパイル",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
16
+ "Copy": "コピー",
17
+ "Data Preprocessing": "データ前処理",
18
+ "Data Preprocessing Path": "データ前処理パス",
19
+ "Data Source": "データソース",
20
+ "Decoder Model Config": "デコーダーモデルの構成",
21
+ "Decoder Model Path": "デコーダーモデルのパス",
22
+ "Disabled": "無効",
23
+ "Enable Reference Audio": "リファレンスオーディオを有効にする",
24
+ "English": "英語",
25
+ "Error Message": "エラーメッセージ",
26
+ "File Preprocessing": "文書前处理",
27
+ "Generate": "生成",
28
+ "Generated Audio": "生成されたオーディオ",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
30
+ "Infer interface is closed": "推論インターフェースが閉じられています",
31
+ "Inference Configuration": "推論設定",
32
+ "Inference Server Configuration": "推論サーバー設定",
33
+ "Inference Server Error": "推論サーバーエラー",
34
+ "Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
35
+ "Initial Learning Rate": "初期学習率",
36
+ "Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
37
+ "Input Text": "入力テキスト",
38
+ "Invalid path: {}": "無効なパス: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
40
+ "Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
41
+ "Japanese": "日本語",
42
+ "LLAMA Configuration": "LLAMA設定",
43
+ "LLAMA Model Config": "LLAMAモデル設定",
44
+ "LLAMA Model Path": "LLAMAモデルパス",
45
+ "Labeling Device": "ラベリングデバイス",
46
+ "LoRA Model to be merged": "マージするLoRAモデル",
47
+ "Maximum Audio Duration": "最大オーディオの長さ",
48
+ "Maximum Length per Sample": "サンプルあたりの最大長",
49
+ "Maximum Training Steps": "最大トレーニングステップ数",
50
+ "Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
51
+ "Merge": "マージ",
52
+ "Merge LoRA": "LoRAのマージ",
53
+ "Merge successfully": "マージに成功しました",
54
+ "Minimum Audio Duration": "最小オーディオの長さ",
55
+ "Model Output Path": "モデル出力パス",
56
+ "Model Size": "モデルサイズ",
57
+ "Move": "移動",
58
+ "Move files successfully": "ファイルの移動に成功しました",
59
+ "No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
60
+ "No selected options": "選択されたオプションはありません",
61
+ "Number of Workers": "ワーカー数",
62
+ "Open Inference Server": "推論サーバーを開く",
63
+ "Open Labeler WebUI": "ラベラーWebUIを開く",
64
+ "Open Tensorboard": "Tensorboardを開く",
65
+ "Opened labeler in browser": "ブラウザでラベラーを開きました",
66
+ "Optional Label Language": "オプションのラベル言語",
67
+ "Optional online ver": "オプションのオンラインバージョン",
68
+ "Output Path": "出力パス",
69
+ "Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
70
+ "Precision": "精度",
71
+ "Probability of applying Speaker Condition": "話者条件を適用する確率",
72
+ "Put your text here.": "ここにテキストを入力してください。",
73
+ "Reference Audio": "リファレンスオーディオ",
74
+ "Reference Text": "リファレンステキスト",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
76
+ "Remove Selected Data": "選択したデータを削除",
77
+ "Removed path successfully!": "パスの削除に成功しました!",
78
+ "Repetition Penalty": "反復ペナルティ",
79
+ "Save model every n steps": "nステップごとにモデルを保存",
80
+ "Select LLAMA ckpt": " LLAMA チェックポイントを選択",
81
+ "Select VITS ckpt": "VITS チェックポイントを選択",
82
+ "Select VQGAN ckpt": "VQGAN チェックポイントを選択",
83
+ "Select source file processing method": "ソースファイルの処理方法を選択",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
85
+ "Selected: {}": "選択済み: {}",
86
+ "Speaker": "話者",
87
+ "Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
88
+ "Start Training": "トレーニング開始",
89
+ "Streaming Audio": "ストリーミングオーディオ",
90
+ "Streaming Generate": "ストリーミング合成",
91
+ "Tensorboard Host": "Tensorboardホスト",
92
+ "Tensorboard Log Path": "Tensorboardログパス",
93
+ "Tensorboard Port": "Tensorboardポート",
94
+ "Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
95
+ "Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
96
+ "Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
98
+ "Training Configuration": "トレーニング設定",
99
+ "Training Error": "トレーニングエラー",
100
+ "Training stopped": "トレーニングが停止しました",
101
+ "Type name of the speaker": "話者の名前を入力",
102
+ "Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
103
+ "Use LoRA": "LoRAを使用",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
105
+ "Use filelist": "ファイルリストを使用",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
107
+ "VITS Configuration": "VITS の構成",
108
+ "VQGAN Configuration": "VQGAN の構成",
109
+ "Validation Batch Size": "検証バッチサイズ",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
112
+ "WebUI Host": "WebUIホスト",
113
+ "WebUI Port": "WebUIポート",
114
+ "Whisper Model": "Whisperモデル",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
117
+ "latest": "最新",
118
+ "new": "新規",
119
+ "Realtime Transform Text": "リアルタイム変換テキスト",
120
+ "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
121
+ "Text Normalization": "テキスト正規化",
122
+ "Select Example Audio": "サンプル音声を選択"
123
+ }
fish_speech/i18n/locale/ko_KR.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
5
- "Accumulate Gradient Batches": "그라디언트 배치 누적",
6
- "Add to Processing Area": "처리 영역에 추가",
7
- "Added path successfully!": "경로가 성공적으로 추가되었습니다!",
8
- "Advanced Config": "고급 설정",
9
- "Base LLAMA Model": "기본 LLAMA 모델",
10
- "Batch Inference": "배치 추론",
11
- "Batch Size": "배치 크기",
12
- "Changing with the Model Path": "모델 경로에 따라 변경 중",
13
- "Chinese": "중국어",
14
- "Compile Model": "모델 컴파일",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
16
- "Copy": "복사",
17
- "Data Preprocessing": "데이터 전처리",
18
- "Data Preprocessing Path": "데이터 전처리 경로",
19
- "Data Source": "데이터 소스",
20
- "Decoder Model Config": "디코더 모델 설정",
21
- "Decoder Model Path": "디코더 모델 경로",
22
- "Disabled": "비활성화 됨",
23
- "Enable Reference Audio": "참고 음성 활성화",
24
- "English": "영어",
25
- "Error Message": "오류 메시지",
26
- "File Preprocessing": "파일 전처리",
27
- "Generate": "생성",
28
- "Generated Audio": "생성된 오디오",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
30
- "Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
31
- "Inference Configuration": "추론 설정",
32
- "Inference Server Configuration": "추론 서버 설정",
33
- "Inference Server Error": "추론 서버 오류",
34
- "Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
35
- "Initial Learning Rate": "초기 학습률",
36
- "Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
37
- "Input Text": "입력 텍스트",
38
- "Invalid path: {}": "유효하지 않은 경로: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
40
- "Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
41
- "Japanese": "일본어",
42
- "LLAMA Configuration": "LLAMA 설정",
43
- "LLAMA Model Config": "LLAMA 모델 설정",
44
- "LLAMA Model Path": "LLAMA 모델 경로",
45
- "Labeling Device": "라벨링 장치",
46
- "LoRA Model to be merged": "병합할 LoRA 모델",
47
- "Maximum Audio Duration": "최대 오디오 길이",
48
- "Maximum Length per Sample": "샘플당 최대 길이",
49
- "Maximum Training Steps": "최대 학습 단계",
50
- "Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
51
- "Merge": "병합",
52
- "Merge LoRA": "LoRA 병합",
53
- "Merge successfully": "성공적으로 병합 되었습니다.",
54
- "Minimum Audio Duration": "최소 오디오 길이",
55
- "Model Output Path": "모델 출력 경로",
56
- "Model Size": "모델 크기",
57
- "Move": "이동",
58
- "Move files successfully": "파일이 성공적으로 이동되었습니다.",
59
- "No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
60
- "No selected options": "옵션이 선택되지 않았습니다.",
61
- "Number of Workers": "작업자 수",
62
- "Open Inference Server": "추론 서버 열기",
63
- "Open Labeler WebUI": "라벨러 WebUI 열기",
64
- "Open Tensorboard": "Tensorboard 열기",
65
- "Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
66
- "Optional Label Language": "선택적 라벨 언어",
67
- "Optional online ver": "온라인 버전 선택",
68
- "Output Path": "출력 경로",
69
- "Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
70
- "Precision": "정밀도",
71
- "Probability of applying Speaker Condition": "화자 조건 적용 확률",
72
- "Put your text here.": "여기에 텍스트를 입력하세요.",
73
- "Reference Audio": "참고 오디오",
74
- "Reference Text": "참고 텍스트",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
76
- "Remove Selected Data": "선택한 데이터 제거",
77
- "Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
78
- "Repetition Penalty": "반복 패널티",
79
- "Save model every n steps": "n 단계마다 모델 저장",
80
- "Select LLAMA ckpt": "LLAMA ckpt 선택",
81
- "Select VITS ckpt": "VITS ckpt 선택",
82
- "Select VQGAN ckpt": "VQGAN ckpt 선택",
83
- "Select source file processing method": "소스 파일 처리 방법 선택",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
85
- "Selected: {}": "선택됨: {}",
86
- "Speaker": "화자",
87
- "Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
88
- "Start Training": "학습 시작",
89
- "Streaming Audio": "스트리밍 오디오",
90
- "Streaming Generate": "스트리밍 생성",
91
- "Tensorboard Host": "Tensorboard 호스트",
92
- "Tensorboard Log Path": "Tensorboard 로그 경로",
93
- "Tensorboard Port": "Tensorboard 포트",
94
- "Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
95
- "Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
96
- "Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
98
- "Training Configuration": "학습 설정",
99
- "Training Error": "학습 오류",
100
- "Training stopped": "학습이 중지되었습니다.",
101
- "Type name of the speaker": "화자의 이름을 입력하세요.",
102
- "Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
103
- "Use LoRA": "LoRA 사용",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
105
- "Use filelist": "파일 목록 사용",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
107
- "VITS Configuration": "VITS 설정",
108
- "VQGAN Configuration": "VQGAN 설정",
109
- "Validation Batch Size": "검증 배치 크기",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
112
- "WebUI Host": "WebUI 호스트",
113
- "WebUI Port": "WebUI 포트",
114
- "Whisper Model": "Whisper 모델",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
117
- "latest": "최신",
118
- "new": "새로운",
119
- "Realtime Transform Text": "실시간 텍스트 변환",
120
- "Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
121
- "Text Normalization": "텍스트 정규화",
122
- "Select Example Audio": "예시 오디오 선택"
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
5
+ "Accumulate Gradient Batches": "그라디언트 배치 누적",
6
+ "Add to Processing Area": "처리 영역에 추가",
7
+ "Added path successfully!": "경로가 성공적으로 추가되었습니다!",
8
+ "Advanced Config": "고급 설정",
9
+ "Base LLAMA Model": "기본 LLAMA 모델",
10
+ "Batch Inference": "배치 추론",
11
+ "Batch Size": "배치 크기",
12
+ "Changing with the Model Path": "모델 경로에 따라 변경 중",
13
+ "Chinese": "중국어",
14
+ "Compile Model": "모델 컴파일",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
16
+ "Copy": "복사",
17
+ "Data Preprocessing": "데이터 전처리",
18
+ "Data Preprocessing Path": "데이터 전처리 경로",
19
+ "Data Source": "데이터 소스",
20
+ "Decoder Model Config": "디코더 모델 설정",
21
+ "Decoder Model Path": "디코더 모델 경로",
22
+ "Disabled": "비활성화 됨",
23
+ "Enable Reference Audio": "참고 음성 활성화",
24
+ "English": "영어",
25
+ "Error Message": "오류 메시지",
26
+ "File Preprocessing": "파일 전처리",
27
+ "Generate": "생성",
28
+ "Generated Audio": "생성된 오디오",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
30
+ "Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
31
+ "Inference Configuration": "추론 설정",
32
+ "Inference Server Configuration": "추론 서버 설정",
33
+ "Inference Server Error": "추론 서버 오류",
34
+ "Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
35
+ "Initial Learning Rate": "초기 학습률",
36
+ "Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
37
+ "Input Text": "입력 텍스트",
38
+ "Invalid path: {}": "유효하지 않은 경로: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
40
+ "Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
41
+ "Japanese": "일본어",
42
+ "LLAMA Configuration": "LLAMA 설정",
43
+ "LLAMA Model Config": "LLAMA 모델 설정",
44
+ "LLAMA Model Path": "LLAMA 모델 경로",
45
+ "Labeling Device": "라벨링 장치",
46
+ "LoRA Model to be merged": "병합할 LoRA 모델",
47
+ "Maximum Audio Duration": "최대 오디오 길이",
48
+ "Maximum Length per Sample": "샘플당 최대 길이",
49
+ "Maximum Training Steps": "최대 학습 단계",
50
+ "Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
51
+ "Merge": "병합",
52
+ "Merge LoRA": "LoRA 병합",
53
+ "Merge successfully": "성공적으로 병합 되었습니다.",
54
+ "Minimum Audio Duration": "최소 오디오 길이",
55
+ "Model Output Path": "모델 출력 경로",
56
+ "Model Size": "모델 크기",
57
+ "Move": "이동",
58
+ "Move files successfully": "파일이 성공적으로 이동되었습니다.",
59
+ "No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
60
+ "No selected options": "옵션이 선택되지 않았습니다.",
61
+ "Number of Workers": "작업자 수",
62
+ "Open Inference Server": "추론 서버 열기",
63
+ "Open Labeler WebUI": "라벨러 WebUI 열기",
64
+ "Open Tensorboard": "Tensorboard 열기",
65
+ "Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
66
+ "Optional Label Language": "선택적 라벨 언어",
67
+ "Optional online ver": "온라인 버전 선택",
68
+ "Output Path": "출력 경로",
69
+ "Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
70
+ "Precision": "정밀도",
71
+ "Probability of applying Speaker Condition": "화자 조건 적용 확률",
72
+ "Put your text here.": "여기에 텍스트를 입력하세요.",
73
+ "Reference Audio": "참고 오디오",
74
+ "Reference Text": "참고 텍스트",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
76
+ "Remove Selected Data": "선택한 데이터 제거",
77
+ "Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
78
+ "Repetition Penalty": "반복 패널티",
79
+ "Save model every n steps": "n 단계마다 모델 저장",
80
+ "Select LLAMA ckpt": "LLAMA ckpt 선택",
81
+ "Select VITS ckpt": "VITS ckpt 선택",
82
+ "Select VQGAN ckpt": "VQGAN ckpt 선택",
83
+ "Select source file processing method": "소스 파일 처리 방법 선택",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
85
+ "Selected: {}": "선택됨: {}",
86
+ "Speaker": "화자",
87
+ "Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
88
+ "Start Training": "학습 시작",
89
+ "Streaming Audio": "스트리밍 오디오",
90
+ "Streaming Generate": "스트리밍 생성",
91
+ "Tensorboard Host": "Tensorboard 호스트",
92
+ "Tensorboard Log Path": "Tensorboard 로그 경로",
93
+ "Tensorboard Port": "Tensorboard 포트",
94
+ "Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
95
+ "Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
96
+ "Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
98
+ "Training Configuration": "학습 설정",
99
+ "Training Error": "학습 오류",
100
+ "Training stopped": "학습이 중지되었습니다.",
101
+ "Type name of the speaker": "화자의 이름을 입력하세요.",
102
+ "Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
103
+ "Use LoRA": "LoRA 사용",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
105
+ "Use filelist": "파일 목록 사용",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
107
+ "VITS Configuration": "VITS 설정",
108
+ "VQGAN Configuration": "VQGAN 설정",
109
+ "Validation Batch Size": "검증 배치 크기",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
112
+ "WebUI Host": "WebUI 호스트",
113
+ "WebUI Port": "WebUI 포트",
114
+ "Whisper Model": "Whisper 모델",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
117
+ "latest": "최신",
118
+ "new": "새로운",
119
+ "Realtime Transform Text": "실시간 텍스트 변환",
120
+ "Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
121
+ "Text Normalization": "텍스트 정규화",
122
+ "Select Example Audio": "예시 오디오 선택"
123
+ }
fish_speech/i18n/locale/pt_BR.json CHANGED
@@ -1,133 +1,133 @@
1
- {
2
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
3
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
4
- "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
5
- "Add to Processing Area": "Adicionar à Área de Processamento",
6
- "Added path successfully!": "Caminho adicionado com sucesso!",
7
- "Advanced Config": "Configuração Avançada",
8
- "Base LLAMA Model": "Modelo LLAMA Base",
9
- "Batch Inference": "Inferência em Lote",
10
- "Batch Size": "Tamanho do Lote",
11
- "Changing with the Model Path": "Alterando com o Caminho do Modelo",
12
-
13
- "Compile Model": "Compilar Modelo",
14
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
15
- "Copy": "Copiar",
16
- "Data Preprocessing": "Pré-processamento de Dados",
17
- "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
18
- "Data Source": "Fonte de Dados",
19
- "Decoder Model Config": "Configuração do Modelo Decodificador",
20
- "Decoder Model Path": "Caminho do Modelo Decodificador",
21
- "Disabled": "Desativado",
22
- "Enable Initial Prompt": "Habilitar Prompt Inicial",
23
- "Enable Reference Audio": "Habilitar Áudio de Referência",
24
- "English": "Inglês",
25
- "Japanese": "Japonês",
26
- "Chinese": "Chinês",
27
- "Portuguese": "Português",
28
- "Spanish": "Espanhol",
29
- "Error Message": "Mensagem de Erro",
30
- "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
31
- "File Preprocessing": "Pré-processamento de Arquivos",
32
- "Generate": "Gerar",
33
- "Generated Audio": "Áudio Gerado",
34
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
35
- "Infer interface is closed": "A interface de inferência foi fechada",
36
- "Inference Configuration": "Configuração de Inferência",
37
- "Inference Server Configuration": "Configuração do Servidor de Inferência",
38
- "Inference Server Error": "Erro do Servidor de Inferência",
39
- "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
40
- "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
41
- "Initial Prompt": "Prompt Inicial",
42
- "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
43
- "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
44
- "Input Text": "Texto de Entrada",
45
- "Invalid path: {}": "Caminho inválido: {}",
46
- "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
47
- "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
48
- "LLAMA Configuration": "Configuração do LLAMA",
49
- "LLAMA Model Config": "Configuração do Modelo LLAMA",
50
- "LLAMA Model Path": "Caminho do Modelo LLAMA",
51
- "Labeling Device": "Dispositivo de Rotulagem",
52
- "LoRA Model to be merged": "Modelo LoRA para mesclagem",
53
- "Maximum Length per Sample": "Comprimento Máximo por Amostra",
54
- "Maximum Training Steps": "Etapas Máximas de Treinamento",
55
- "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
56
- "Merge": "Mesclar",
57
- "Merge LoRA": "Mesclar LoRA",
58
- "Merge successfully": "Mesclado com sucesso",
59
- "Model Output Path": "Caminho de Saída do Modelo",
60
- "Model Quantization": "Quantização do Modelo",
61
- "Model Size": "Tamanho do Modelo",
62
- "Move": "Mover",
63
- "Move files successfully": "Arquivos movidos com sucesso",
64
- "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
65
- "No selected options": "Nenhuma opção selecionada",
66
- "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
67
- "Number of Workers": "Número de Processos",
68
- "Open Inference Server": "Abrir Servidor de Inferência",
69
- "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
70
- "Open Tensorboard": "Abrir Tensorboard",
71
- "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
72
- "Optional Label Language": "Idioma do Rótulo (Opcional)",
73
- "Optional online ver": "Versão online (opcional)",
74
- "Output Path": "Caminho de Saída",
75
- "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
76
- "Post-quantification Precision": "Precisão Pós-quantização",
77
- "Precision": "Precisão",
78
- "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
79
- "Put your text here.": "Insira seu texto aqui.",
80
- "Quantify": "Quantizar",
81
- "Quantify successfully": "Quantizado com sucesso",
82
- "Realtime Transform Text": "Transformar Texto em Tempo Real",
83
- "Reference Audio": "Áudio de Referência",
84
- "Reference Text": "Texto de Referência",
85
- "warning": "Aviso",
86
- "Pre-processing begins...": "O pré-processamento começou!",
87
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
88
- "Remove Selected Data": "Remover Dados Selecionados",
89
- "Removed path successfully!": "Caminho removido com sucesso!",
90
- "Repetition Penalty": "Penalidade de Repetição",
91
- "Save model every n steps": "Salvar modelo a cada n etapas",
92
- "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
93
- "Select source file processing method": "Escolha como processar o arquivo de origem",
94
- "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
95
- "Selected: {}": "Selecionado: {}",
96
- "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
97
- "Start Training": "Iniciar Treinamento",
98
- "Streaming Audio": "Áudio em Streaming",
99
- "Streaming Generate": "Geração em Streaming",
100
- "Tensorboard Host": "Host do Tensorboard",
101
- "Tensorboard Log Path": "Caminho de Log do Tensorboard",
102
- "Tensorboard Port": "Porta do Tensorboard",
103
- "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
104
- "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
105
- "Text Normalization": "Normalização de Texto",
106
- "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
107
- "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
108
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
109
- "Training Configuration": "Configuração de Treinamento",
110
- "Training Error": "Erro de Treinamento",
111
- "Training stopped": "Treinamento interrompido!",
112
- "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
113
- "Use LoRA": "Usar LoRA",
114
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
115
- "Use filelist": "Usar lista de arquivos",
116
- "VQGAN Configuration": "Configuração do VQGAN",
117
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
118
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
119
- "WebUI Host": "Host da WebUI",
120
- "WebUI Port": "Porta da WebUI",
121
- "Whisper Model": "Modelo Whisper",
122
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
123
- "auto": "automático",
124
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
125
- "latest": "mais recente",
126
- "new": "novo",
127
- "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
128
- "You don't need to train this model!": "Não é necessário treinar este modelo!",
129
- "Yes": "Sim",
130
- "No": "Não",
131
- "version:": "versão:",
132
- "author:": "autor:"
133
- }
 
1
+ {
2
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
3
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
4
+ "Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
5
+ "Add to Processing Area": "Adicionar à Área de Processamento",
6
+ "Added path successfully!": "Caminho adicionado com sucesso!",
7
+ "Advanced Config": "Configuração Avançada",
8
+ "Base LLAMA Model": "Modelo LLAMA Base",
9
+ "Batch Inference": "Inferência em Lote",
10
+ "Batch Size": "Tamanho do Lote",
11
+ "Changing with the Model Path": "Alterando com o Caminho do Modelo",
12
+
13
+ "Compile Model": "Compilar Modelo",
14
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
15
+ "Copy": "Copiar",
16
+ "Data Preprocessing": "Pré-processamento de Dados",
17
+ "Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
18
+ "Data Source": "Fonte de Dados",
19
+ "Decoder Model Config": "Configuração do Modelo Decodificador",
20
+ "Decoder Model Path": "Caminho do Modelo Decodificador",
21
+ "Disabled": "Desativado",
22
+ "Enable Initial Prompt": "Habilitar Prompt Inicial",
23
+ "Enable Reference Audio": "Habilitar Áudio de Referência",
24
+ "English": "Inglês",
25
+ "Japanese": "Japonês",
26
+ "Chinese": "Chinês",
27
+ "Portuguese": "Português",
28
+ "Spanish": "Espanhol",
29
+ "Error Message": "Mensagem de Erro",
30
+ "Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
31
+ "File Preprocessing": "Pré-processamento de Arquivos",
32
+ "Generate": "Gerar",
33
+ "Generated Audio": "Áudio Gerado",
34
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
35
+ "Infer interface is closed": "A interface de inferência foi fechada",
36
+ "Inference Configuration": "Configuração de Inferência",
37
+ "Inference Server Configuration": "Configuração do Servidor de Inferência",
38
+ "Inference Server Error": "Erro do Servidor de Inferência",
39
+ "Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
40
+ "Initial Learning Rate": "Taxa de Aprendizagem Inicial",
41
+ "Initial Prompt": "Prompt Inicial",
42
+ "Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
43
+ "Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
44
+ "Input Text": "Texto de Entrada",
45
+ "Invalid path: {}": "Caminho inválido: {}",
46
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
47
+ "Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
48
+ "LLAMA Configuration": "Configuração do LLAMA",
49
+ "LLAMA Model Config": "Configuração do Modelo LLAMA",
50
+ "LLAMA Model Path": "Caminho do Modelo LLAMA",
51
+ "Labeling Device": "Dispositivo de Rotulagem",
52
+ "LoRA Model to be merged": "Modelo LoRA para mesclagem",
53
+ "Maximum Length per Sample": "Comprimento Máximo por Amostra",
54
+ "Maximum Training Steps": "Etapas Máximas de Treinamento",
55
+ "Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
56
+ "Merge": "Mesclar",
57
+ "Merge LoRA": "Mesclar LoRA",
58
+ "Merge successfully": "Mesclado com sucesso",
59
+ "Model Output Path": "Caminho de Saída do Modelo",
60
+ "Model Quantization": "Quantização do Modelo",
61
+ "Model Size": "Tamanho do Modelo",
62
+ "Move": "Mover",
63
+ "Move files successfully": "Arquivos movidos com sucesso",
64
+ "No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
65
+ "No selected options": "Nenhuma opção selecionada",
66
+ "Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
67
+ "Number of Workers": "Número de Processos",
68
+ "Open Inference Server": "Abrir Servidor de Inferência",
69
+ "Open Labeler WebUI": "Abrir WebUI de Rotulagem",
70
+ "Open Tensorboard": "Abrir Tensorboard",
71
+ "Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
72
+ "Optional Label Language": "Idioma do Rótulo (Opcional)",
73
+ "Optional online ver": "Versão online (opcional)",
74
+ "Output Path": "Caminho de Saída",
75
+ "Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
76
+ "Post-quantification Precision": "Precisão Pós-quantização",
77
+ "Precision": "Precisão",
78
+ "Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
79
+ "Put your text here.": "Insira seu texto aqui.",
80
+ "Quantify": "Quantizar",
81
+ "Quantify successfully": "Quantizado com sucesso",
82
+ "Realtime Transform Text": "Transformar Texto em Tempo Real",
83
+ "Reference Audio": "Áudio de Referência",
84
+ "Reference Text": "Texto de Referência",
85
+ "warning": "Aviso",
86
+ "Pre-processing begins...": "O pré-processamento começou!",
87
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
88
+ "Remove Selected Data": "Remover Dados Selecionados",
89
+ "Removed path successfully!": "Caminho removido com sucesso!",
90
+ "Repetition Penalty": "Penalidade de Repetição",
91
+ "Save model every n steps": "Salvar modelo a cada n etapas",
92
+ "Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
93
+ "Select source file processing method": "Escolha como processar o arquivo de origem",
94
+ "Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
95
+ "Selected: {}": "Selecionado: {}",
96
+ "Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
97
+ "Start Training": "Iniciar Treinamento",
98
+ "Streaming Audio": "Áudio em Streaming",
99
+ "Streaming Generate": "Geração em Streaming",
100
+ "Tensorboard Host": "Host do Tensorboard",
101
+ "Tensorboard Log Path": "Caminho de Log do Tensorboard",
102
+ "Tensorboard Port": "Porta do Tensorboard",
103
+ "Tensorboard interface is closed": "A interface do Tensorboard está fechada",
104
+ "Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
105
+ "Text Normalization": "Normalização de Texto",
106
+ "Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
107
+ "The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
108
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
109
+ "Training Configuration": "Configuração de Treinamento",
110
+ "Training Error": "Erro de Treinamento",
111
+ "Training stopped": "Treinamento interrompido!",
112
+ "Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
113
+ "Use LoRA": "Usar LoRA",
114
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
115
+ "Use filelist": "Usar lista de arquivos",
116
+ "VQGAN Configuration": "Configuração do VQGAN",
117
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
118
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
119
+ "WebUI Host": "Host da WebUI",
120
+ "WebUI Port": "Porta da WebUI",
121
+ "Whisper Model": "Modelo Whisper",
122
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
123
+ "auto": "automático",
124
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
125
+ "latest": "mais recente",
126
+ "new": "novo",
127
+ "This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
128
+ "You don't need to train this model!": "Não é necessário treinar este modelo!",
129
+ "Yes": "Sim",
130
+ "No": "Não",
131
+ "version:": "versão:",
132
+ "author:": "autor:"
133
+ }
fish_speech/i18n/locale/zh_CN.json CHANGED
@@ -1,123 +1,123 @@
1
- {
2
- "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
3
- "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
4
- "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
5
- "Accumulate Gradient Batches": "梯度累积批次",
6
- "Add to Processing Area": "加入处理区",
7
- "Added path successfully!": "添加路径成功!",
8
- "Advanced Config": "高级参数",
9
- "Base LLAMA Model": "基础 LLAMA 模型",
10
- "Batch Inference": "批量推理",
11
- "Batch Size": "批次大小",
12
- "Changing with the Model Path": "随模型路径变化",
13
- "Chinese": "中文",
14
- "Compile Model": "编译模型",
15
- "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
16
- "Copy": "复制",
17
- "Data Preprocessing": "数据预处理",
18
- "Data Preprocessing Path": "数据预处理路径",
19
- "Data Source": "数据源",
20
- "Decoder Model Config": "解码器模型配置",
21
- "Decoder Model Path": "解码器模型路径",
22
- "Disabled": "禁用",
23
- "Enable Reference Audio": "启用参考音频",
24
- "English": "英文",
25
- "Error Message": "错误信息",
26
- "File Preprocessing": "文件预处理",
27
- "Generate": "生成",
28
- "Generated Audio": "音频",
29
- "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
30
- "Infer interface is closed": "推理界面已关闭",
31
- "Inference Configuration": "推理配置",
32
- "Inference Server Configuration": "推理服务器配置",
33
- "Inference Server Error": "推理服务器错误",
34
- "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
35
- "Initial Learning Rate": "初始学习率",
36
- "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
37
- "Input Text": "输入文本",
38
- "Invalid path: {}": "无效路径: {}",
39
- "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
40
- "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
41
- "Japanese": "日文",
42
- "LLAMA Configuration": "LLAMA 配置",
43
- "LLAMA Model Config": "LLAMA 模型配置",
44
- "LLAMA Model Path": "LLAMA 模型路径",
45
- "Labeling Device": "标注加速设备",
46
- "LoRA Model to be merged": "要合并的 LoRA 模型",
47
- "Maximum Audio Duration": "最大音频时长",
48
- "Maximum Length per Sample": "每个样本的最大长度",
49
- "Maximum Training Steps": "最大训练步数",
50
- "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
51
- "Merge": "合并",
52
- "Merge LoRA": "合并 LoRA",
53
- "Merge successfully": "合并成功",
54
- "Minimum Audio Duration": "最小音频时长",
55
- "Model Output Path": "模型输出路径",
56
- "Model Size": "模型规模",
57
- "Move": "移动",
58
- "Move files successfully": "移动文件成功",
59
- "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
60
- "No selected options": "没有选择的选项",
61
- "Number of Workers": "数据加载进程数",
62
- "Open Inference Server": "打开推理服务器",
63
- "Open Labeler WebUI": "打开标注工具",
64
- "Open Tensorboard": "打开 Tensorboard",
65
- "Opened labeler in browser": "在浏览器中打开标注工具",
66
- "Optional Label Language": "[可选] 标注语言",
67
- "Optional online ver": "[可选] 使用在线版",
68
- "Output Path": "输出路径",
69
- "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
70
- "Precision": "精度",
71
- "Probability of applying Speaker Condition": "应用说话人条件的概率",
72
- "Put your text here.": "在此处输入文本.",
73
- "Reference Audio": "参考音频",
74
- "Reference Text": "参考文本",
75
- "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
76
- "Remove Selected Data": "移除选中数据",
77
- "Removed path successfully!": "移除路径成功!",
78
- "Repetition Penalty": "重复惩罚",
79
- "Save model every n steps": "每 n 步保存模型",
80
- "Select LLAMA ckpt": "选择 LLAMA 检查点",
81
- "Select VITS ckpt": "选择 VITS 检查点",
82
- "Select VQGAN ckpt": "选择 VQGAN 检查点",
83
- "Select source file processing method": "选择源文件处理方法",
84
- "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
85
- "Selected: {}": "已选择: {}",
86
- "Speaker": "说话人",
87
- "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
88
- "Start Training": "开始训练",
89
- "Streaming Audio": "流式音频",
90
- "Streaming Generate": "流式合成",
91
- "Tensorboard Host": "Tensorboard 监听地址",
92
- "Tensorboard Log Path": "Tensorboard 日志路径",
93
- "Tensorboard Port": "Tensorboard 端口",
94
- "Tensorboard interface is closed": "Tensorboard 界面已关闭",
95
- "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
96
- "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
97
- "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
98
- "Training Configuration": "训练配置",
99
- "Training Error": "训练错误",
100
- "Training stopped": "训练已停止",
101
- "Type name of the speaker": "输入说话人的名称",
102
- "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
103
- "Use LoRA": "使用 LoRA",
104
- "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
105
- "Use filelist": "使用文件列表",
106
- "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
107
- "VITS Configuration": "VITS 配置",
108
- "VQGAN Configuration": "VQGAN 配置",
109
- "Validation Batch Size": "验证批次大小",
110
- "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
111
- "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
112
- "WebUI Host": "WebUI 监听地址",
113
- "WebUI Port": "WebUI 端口",
114
- "Whisper Model": "Whisper 模型",
115
- "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
116
- "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
117
- "latest": "最近的检查点",
118
- "new": "创建新的检查点",
119
- "Realtime Transform Text": "实时规范化文本",
120
- "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
121
- "Text Normalization": "文本规范化",
122
- "Select Example Audio": "选择参考音频"
123
- }
 
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
5
+ "Accumulate Gradient Batches": "梯度累积批次",
6
+ "Add to Processing Area": "加入处理区",
7
+ "Added path successfully!": "添加路径成功!",
8
+ "Advanced Config": "高级参数",
9
+ "Base LLAMA Model": "基础 LLAMA 模型",
10
+ "Batch Inference": "批量推理",
11
+ "Batch Size": "批次大小",
12
+ "Changing with the Model Path": "随模型路径变化",
13
+ "Chinese": "中文",
14
+ "Compile Model": "编译模型",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
16
+ "Copy": "复制",
17
+ "Data Preprocessing": "数据预处理",
18
+ "Data Preprocessing Path": "数据预处理路径",
19
+ "Data Source": "数据源",
20
+ "Decoder Model Config": "解码器模型配置",
21
+ "Decoder Model Path": "解码器模型路径",
22
+ "Disabled": "禁用",
23
+ "Enable Reference Audio": "启用参考音频",
24
+ "English": "英文",
25
+ "Error Message": "错误信息",
26
+ "File Preprocessing": "文件预处理",
27
+ "Generate": "生成",
28
+ "Generated Audio": "音频",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
30
+ "Infer interface is closed": "推理界面已关闭",
31
+ "Inference Configuration": "推理配置",
32
+ "Inference Server Configuration": "推理服务器配置",
33
+ "Inference Server Error": "推理服务器错误",
34
+ "Inferring interface is launched at {}": "推理界面已在 {} 上启动",
35
+ "Initial Learning Rate": "初始学习率",
36
+ "Input Audio & Source Path for Transcription": "输入音频和转录源路径",
37
+ "Input Text": "输入文本",
38
+ "Invalid path: {}": "无效���径: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
40
+ "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
41
+ "Japanese": "日文",
42
+ "LLAMA Configuration": "LLAMA 配置",
43
+ "LLAMA Model Config": "LLAMA 模型配置",
44
+ "LLAMA Model Path": "LLAMA 模型路径",
45
+ "Labeling Device": "标注加速设备",
46
+ "LoRA Model to be merged": "要合并的 LoRA 模型",
47
+ "Maximum Audio Duration": "最大音频时长",
48
+ "Maximum Length per Sample": "每个样本的最大长度",
49
+ "Maximum Training Steps": "最大训练步数",
50
+ "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
51
+ "Merge": "合并",
52
+ "Merge LoRA": "合并 LoRA",
53
+ "Merge successfully": "合并成功",
54
+ "Minimum Audio Duration": "最小音频时长",
55
+ "Model Output Path": "模型输出路径",
56
+ "Model Size": "模型规模",
57
+ "Move": "移动",
58
+ "Move files successfully": "移动文件成功",
59
+ "No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
60
+ "No selected options": "没有选择的选项",
61
+ "Number of Workers": "数据加载进程数",
62
+ "Open Inference Server": "打开推理服务器",
63
+ "Open Labeler WebUI": "打开标注工具",
64
+ "Open Tensorboard": "打开 Tensorboard",
65
+ "Opened labeler in browser": "在浏览器中打开标注工具",
66
+ "Optional Label Language": "[可选] 标注语言",
67
+ "Optional online ver": "[可选] 使用在线版",
68
+ "Output Path": "输出路径",
69
+ "Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
70
+ "Precision": "精度",
71
+ "Probability of applying Speaker Condition": "应用说话人条件的概率",
72
+ "Put your text here.": "在此处输入文本.",
73
+ "Reference Audio": "参考音频",
74
+ "Reference Text": "参考文本",
75
+ "Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
76
+ "Remove Selected Data": "移除选中数据",
77
+ "Removed path successfully!": "移除路径成功!",
78
+ "Repetition Penalty": "重复惩罚",
79
+ "Save model every n steps": "每 n 步保存模型",
80
+ "Select LLAMA ckpt": "选择 LLAMA 检查点",
81
+ "Select VITS ckpt": "选择 VITS 检查点",
82
+ "Select VQGAN ckpt": "选择 VQGAN 检查点",
83
+ "Select source file processing method": "选择源文件处理方法",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
85
+ "Selected: {}": "已选择: {}",
86
+ "Speaker": "说话人",
87
+ "Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
88
+ "Start Training": "开始训练",
89
+ "Streaming Audio": "流式音频",
90
+ "Streaming Generate": "流式合成",
91
+ "Tensorboard Host": "Tensorboard 监听地址",
92
+ "Tensorboard Log Path": "Tensorboard 日志路径",
93
+ "Tensorboard Port": "Tensorboard 端口",
94
+ "Tensorboard interface is closed": "Tensorboard 界面已关闭",
95
+ "Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
96
+ "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
98
+ "Training Configuration": "训练配置",
99
+ "Training Error": "训练错误",
100
+ "Training stopped": "训练已停止",
101
+ "Type name of the speaker": "输入说话人的名称",
102
+ "Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
103
+ "Use LoRA": "使用 LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
105
+ "Use filelist": "使用文件列表",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
107
+ "VITS Configuration": "VITS 配置",
108
+ "VQGAN Configuration": "VQGAN 配置",
109
+ "Validation Batch Size": "验证批次大小",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
112
+ "WebUI Host": "WebUI 监听地址",
113
+ "WebUI Port": "WebUI 端口",
114
+ "Whisper Model": "Whisper 模型",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
117
+ "latest": "最近的检查点",
118
+ "new": "创建新的检查点",
119
+ "Realtime Transform Text": "实时规范化文本",
120
+ "Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
121
+ "Text Normalization": "文本规范化",
122
+ "Select Example Audio": "选择参考音频"
123
+ }
fish_speech/i18n/scan.py CHANGED
@@ -1,122 +1,122 @@
1
- import ast
2
- import glob
3
- import json
4
- from collections import OrderedDict
5
- from pathlib import Path
6
-
7
- from loguru import logger
8
-
9
- from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
10
-
11
-
12
- def extract_i18n_strings(node):
13
- i18n_strings = []
14
-
15
- if (
16
- isinstance(node, ast.Call)
17
- and isinstance(node.func, ast.Name)
18
- and node.func.id == "i18n"
19
- ):
20
- for arg in node.args:
21
- if isinstance(arg, ast.Str):
22
- i18n_strings.append(arg.s)
23
-
24
- for child_node in ast.iter_child_nodes(node):
25
- i18n_strings.extend(extract_i18n_strings(child_node))
26
-
27
- return i18n_strings
28
-
29
-
30
- # scan the directory for all .py files (recursively)
31
- # for each file, parse the code into an AST
32
- # for each AST, extract the i18n strings
33
-
34
- strings = []
35
- folders = ["fish_speech", "tools"]
36
- # for filename in glob.iglob("**/*.py", recursive=True):
37
- for folder in folders:
38
- for f in Path(folder).rglob("*.py"):
39
- code = f.read_text(encoding="utf-8")
40
- if "i18n(" in code:
41
- tree = ast.parse(code)
42
- i18n_strings = extract_i18n_strings(tree)
43
- logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
44
- strings.extend(i18n_strings)
45
-
46
- code_keys = set(strings)
47
- logger.info(f"Total unique: {len(code_keys)}")
48
-
49
-
50
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
51
- with open(standard_file, "r", encoding="utf-8") as f:
52
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
53
- standard_keys = set(standard_data.keys())
54
-
55
- # Define the standard file name
56
- unused_keys = standard_keys - code_keys
57
- logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
58
- for unused_key in unused_keys:
59
- logger.info(f"\t{unused_key}")
60
-
61
- missing_keys = code_keys - standard_keys
62
- logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
63
- for missing_key in missing_keys:
64
- logger.info(f"\t{missing_key}")
65
-
66
- code_keys_dict = OrderedDict()
67
- for s in strings:
68
- code_keys_dict[s] = s
69
-
70
- # write back
71
- with open(standard_file, "w", encoding="utf-8") as f:
72
- json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
73
- f.write("\n")
74
-
75
- logger.info(f"Updated {standard_file}")
76
-
77
-
78
- # Define the standard file name
79
- standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
80
-
81
- # Find all JSON files in the directory
82
- dir_path = I18N_FILE_PATH
83
- languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
84
-
85
- # Load the standard file
86
- with open(standard_file, "r", encoding="utf-8") as f:
87
- standard_data = json.load(f, object_pairs_hook=OrderedDict)
88
-
89
- # Loop through each language file
90
- for lang_file in languages:
91
- # Load the language file
92
- with open(lang_file, "r", encoding="utf-8") as f:
93
- lang_data = json.load(f, object_pairs_hook=OrderedDict)
94
-
95
- # Find the difference between the language file and the standard file
96
- diff = set(standard_data.keys()) - set(lang_data.keys())
97
-
98
- miss = set(lang_data.keys()) - set(standard_data.keys())
99
-
100
- # Add any missing keys to the language file
101
- for key in diff:
102
- lang_data[key] = "#!" + key
103
- logger.info(f"Added missing key: {key} to {lang_file}")
104
-
105
- # Del any extra keys to the language file
106
- for key in miss:
107
- del lang_data[key]
108
- logger.info(f"Del extra key: {key} from {lang_file}")
109
-
110
- # Sort the keys of the language file to match the order of the standard file
111
- lang_data = OrderedDict(
112
- sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
113
- )
114
-
115
- # Save the updated language file
116
- with open(lang_file, "w", encoding="utf-8") as f:
117
- json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
118
- f.write("\n")
119
-
120
- logger.info(f"Updated {lang_file}")
121
-
122
- logger.info("Done")
 
1
+ import ast
2
+ import glob
3
+ import json
4
+ from collections import OrderedDict
5
+ from pathlib import Path
6
+
7
+ from loguru import logger
8
+
9
+ from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
10
+
11
+
12
+ def extract_i18n_strings(node):
13
+ i18n_strings = []
14
+
15
+ if (
16
+ isinstance(node, ast.Call)
17
+ and isinstance(node.func, ast.Name)
18
+ and node.func.id == "i18n"
19
+ ):
20
+ for arg in node.args:
21
+ if isinstance(arg, ast.Str):
22
+ i18n_strings.append(arg.s)
23
+
24
+ for child_node in ast.iter_child_nodes(node):
25
+ i18n_strings.extend(extract_i18n_strings(child_node))
26
+
27
+ return i18n_strings
28
+
29
+
30
+ # scan the directory for all .py files (recursively)
31
+ # for each file, parse the code into an AST
32
+ # for each AST, extract the i18n strings
33
+
34
+ strings = []
35
+ folders = ["fish_speech", "tools"]
36
+ # for filename in glob.iglob("**/*.py", recursive=True):
37
+ for folder in folders:
38
+ for f in Path(folder).rglob("*.py"):
39
+ code = f.read_text(encoding="utf-8")
40
+ if "i18n(" in code:
41
+ tree = ast.parse(code)
42
+ i18n_strings = extract_i18n_strings(tree)
43
+ logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
44
+ strings.extend(i18n_strings)
45
+
46
+ code_keys = set(strings)
47
+ logger.info(f"Total unique: {len(code_keys)}")
48
+
49
+
50
+ standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
51
+ with open(standard_file, "r", encoding="utf-8") as f:
52
+ standard_data = json.load(f, object_pairs_hook=OrderedDict)
53
+ standard_keys = set(standard_data.keys())
54
+
55
+ # Define the standard file name
56
+ unused_keys = standard_keys - code_keys
57
+ logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
58
+ for unused_key in unused_keys:
59
+ logger.info(f"\t{unused_key}")
60
+
61
+ missing_keys = code_keys - standard_keys
62
+ logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
63
+ for missing_key in missing_keys:
64
+ logger.info(f"\t{missing_key}")
65
+
66
+ code_keys_dict = OrderedDict()
67
+ for s in strings:
68
+ code_keys_dict[s] = s
69
+
70
+ # write back
71
+ with open(standard_file, "w", encoding="utf-8") as f:
72
+ json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
73
+ f.write("\n")
74
+
75
+ logger.info(f"Updated {standard_file}")
76
+
77
+
78
+ # Define the standard file name
79
+ standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
80
+
81
+ # Find all JSON files in the directory
82
+ dir_path = I18N_FILE_PATH
83
+ languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
84
+
85
+ # Load the standard file
86
+ with open(standard_file, "r", encoding="utf-8") as f:
87
+ standard_data = json.load(f, object_pairs_hook=OrderedDict)
88
+
89
+ # Loop through each language file
90
+ for lang_file in languages:
91
+ # Load the language file
92
+ with open(lang_file, "r", encoding="utf-8") as f:
93
+ lang_data = json.load(f, object_pairs_hook=OrderedDict)
94
+
95
+ # Find the difference between the language file and the standard file
96
+ diff = set(standard_data.keys()) - set(lang_data.keys())
97
+
98
+ miss = set(lang_data.keys()) - set(standard_data.keys())
99
+
100
+ # Add any missing keys to the language file
101
+ for key in diff:
102
+ lang_data[key] = "#!" + key
103
+ logger.info(f"Added missing key: {key} to {lang_file}")
104
+
105
+ # Del any extra keys to the language file
106
+ for key in miss:
107
+ del lang_data[key]
108
+ logger.info(f"Del extra key: {key} from {lang_file}")
109
+
110
+ # Sort the keys of the language file to match the order of the standard file
111
+ lang_data = OrderedDict(
112
+ sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
113
+ )
114
+
115
+ # Save the updated language file
116
+ with open(lang_file, "w", encoding="utf-8") as f:
117
+ json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
118
+ f.write("\n")
119
+
120
+ logger.info(f"Updated {lang_file}")
121
+
122
+ logger.info("Done")
fish_speech/inference_engine/__init__.py ADDED
@@ -0,0 +1,192 @@
1
+ import gc
2
+ import queue
3
+ from typing import Generator
4
+
5
+ import numpy as np
6
+ import torch
7
+ from loguru import logger
8
+
9
+ from fish_speech.inference_engine.reference_loader import ReferenceLoader
10
+ from fish_speech.inference_engine.utils import InferenceResult, wav_chunk_header
11
+ from fish_speech.inference_engine.vq_manager import VQManager
12
+ from fish_speech.models.dac.modded_dac import DAC
13
+ from fish_speech.models.text2semantic.inference import (
14
+ GenerateRequest,
15
+ GenerateResponse,
16
+ WrappedGenerateResponse,
17
+ )
18
+ from fish_speech.utils import autocast_exclude_mps, set_seed
19
+ from fish_speech.utils.schema import ServeTTSRequest
20
+
21
+
22
+ class TTSInferenceEngine(ReferenceLoader, VQManager):
23
+
24
+ def __init__(
25
+ self,
26
+ llama_queue: queue.Queue,
27
+ decoder_model: DAC,
28
+ precision: torch.dtype,
29
+ compile: bool,
30
+ ) -> None:
31
+
32
+ super().__init__()
33
+
34
+ self.llama_queue = llama_queue
35
+ self.decoder_model = decoder_model
36
+ self.precision = precision
37
+ self.compile = compile
38
+
39
+ @torch.inference_mode()
40
+ def inference(self, req: ServeTTSRequest) -> Generator[InferenceResult, None, None]:
41
+ """
42
+ Main inference function:
43
+ - Loads the reference audio and text.
44
+ - Calls the LLAMA model for inference.
45
+ - Decodes the VQ tokens to audio.
46
+ """
47
+
48
+ ref_id: str | None = req.reference_id
49
+ prompt_tokens, prompt_texts = [], []
50
+ # Load the reference audio and text based on id or hash
51
+ if ref_id is not None:
52
+ prompt_tokens, prompt_texts = self.load_by_id(ref_id, req.use_memory_cache)
53
+
54
+ elif req.references:
55
+ prompt_tokens, prompt_texts = self.load_by_hash(
56
+ req.references, req.use_memory_cache
57
+ )
58
+
59
+ # Set the random seed if provided
60
+ if req.seed is not None:
61
+ set_seed(req.seed)
62
+ logger.warning(f"set seed: {req.seed}")
63
+
64
+ # Get the symbolic tokens from the LLAMA model
65
+ response_queue = self.send_Llama_request(req, prompt_tokens, prompt_texts)
66
+
67
+ # Get the sample rate from the decoder model
68
+ if hasattr(self.decoder_model, "spec_transform"):
69
+ sample_rate = self.decoder_model.spec_transform.sample_rate
70
+ else:
71
+ sample_rate = self.decoder_model.sample_rate
72
+
73
+ # If streaming, send the header
74
+ if req.streaming:
75
+ yield InferenceResult(
76
+ code="header",
77
+ audio=(
78
+ sample_rate,
79
+ np.array(wav_chunk_header(sample_rate=sample_rate)),
80
+ ),
81
+ error=None,
82
+ )
83
+
84
+ segments = []
85
+
86
+ while True:
87
+ # Get the response from the LLAMA model
88
+ wrapped_result: WrappedGenerateResponse = response_queue.get()
89
+ if wrapped_result.status == "error":
90
+ yield InferenceResult(
91
+ code="error",
92
+ audio=None,
93
+ error=(
94
+ wrapped_result.response
95
+ if isinstance(wrapped_result.response, Exception)
96
+ else Exception("Unknown error")
97
+ ),
98
+ )
99
+ break
100
+
101
+ # Check the response type
102
+ if not isinstance(wrapped_result.response, GenerateResponse):
103
+ raise TypeError(
104
+ "Expected GenerateResponse, got {type(wrapped_result.response).__name__}"
105
+ )
106
+
107
+ result: GenerateResponse = wrapped_result.response
108
+ if result.action != "next":
109
+ segment = self.get_audio_segment(result)
110
+
111
+ if req.streaming: # Used only by the API server
112
+ yield InferenceResult(
113
+ code="segment",
114
+ audio=(sample_rate, segment),
115
+ error=None,
116
+ )
117
+ segments.append(segment)
118
+ else:
119
+ break
120
+
121
+ # Clean up the memory
122
+ if torch.cuda.is_available():
123
+ torch.cuda.empty_cache()
124
+ gc.collect()
125
+
126
+ # Edge case: no audio generated
127
+ if len(segments) == 0:
128
+ yield InferenceResult(
129
+ code="error",
130
+ audio=None,
131
+ error=RuntimeError("No audio generated, please check the input text."),
132
+ )
133
+ else:
134
+ # Streaming or not, return the final audio
135
+ audio = np.concatenate(segments, axis=0)
136
+ yield InferenceResult(
137
+ code="final",
138
+ audio=(sample_rate, audio),
139
+ error=None,
140
+ )
141
+
142
+ return None
143
+
144
+ def send_Llama_request(
145
+ self, req: ServeTTSRequest, prompt_tokens: list, prompt_texts: list
146
+ ) -> queue.Queue:
147
+ """
148
+ Send a request to the LLAMA model to generate the symbolic tokens.
149
+ """
150
+
151
+ # Prepare the request
152
+ request = dict(
153
+ device=self.decoder_model.device,
154
+ max_new_tokens=req.max_new_tokens,
155
+ text=req.text,
156
+ top_p=req.top_p,
157
+ repetition_penalty=req.repetition_penalty,
158
+ temperature=req.temperature,
159
+ compile=self.compile,
160
+ iterative_prompt=req.chunk_length > 0,
161
+ chunk_length=req.chunk_length,
162
+ prompt_tokens=prompt_tokens,
163
+ prompt_text=prompt_texts,
164
+ )
165
+
166
+ # Create a queue to get the response
167
+ response_queue = queue.Queue()
168
+
169
+ # Send the request to the LLAMA model
170
+ self.llama_queue.put(
171
+ GenerateRequest(
172
+ request=request,
173
+ response_queue=response_queue,
174
+ )
175
+ )
176
+
177
+ return response_queue
178
+
179
+ def get_audio_segment(self, result: GenerateResponse) -> np.ndarray:
180
+ """
181
+ Decode the VQ tokens to audio.
182
+ """
183
+
184
+ # Don't use autocast on MPS devices
185
+ with autocast_exclude_mps(
186
+ device_type=self.decoder_model.device.type, dtype=self.precision
187
+ ):
188
+ # Decode the symbolic tokens to audio
189
+ segment = self.decode_vq_tokens(codes=result.codes)
190
+
191
+ # Convert the audio to numpy
192
+ return segment.float().cpu().numpy()
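
For orientation, a minimal sketch of how this new streaming generator might be driven once a llama_queue and decoder_model exist. The queue and decoder loaders are not part of this hunk, so their wiring is assumed here, and ServeTTSRequest is built with only the fields the engine reads (the remaining fields are assumed to have defaults):

    import torch
    import soundfile as sf
    from fish_speech.inference_engine import TTSInferenceEngine
    from fish_speech.utils.schema import ServeTTSRequest

    # llama_queue and decoder_model are assumed to come from the project's own loaders
    engine = TTSInferenceEngine(
        llama_queue=llama_queue,
        decoder_model=decoder_model,
        precision=torch.bfloat16,
        compile=False,
    )

    req = ServeTTSRequest(text="Hello from OpenAudio S1.", streaming=False)
    for result in engine.inference(req):
        if result.code == "error" and result.error is not None:
            raise result.error
        if result.code == "final" and result.audio is not None:
            sample_rate, audio = result.audio
            sf.write("output.wav", audio, sample_rate)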
fish_speech/inference_engine/reference_loader.py ADDED
@@ -0,0 +1,130 @@
1
+ import io
2
+ from hashlib import sha256
3
+ from pathlib import Path
4
+ from typing import Callable, Literal, Tuple
5
+
6
+ import torch
7
+ import torchaudio
8
+ from loguru import logger
9
+
10
+ from fish_speech.models.dac.modded_dac import DAC
11
+ from fish_speech.utils.file import (
12
+ AUDIO_EXTENSIONS,
13
+ audio_to_bytes,
14
+ list_files,
15
+ read_ref_text,
16
+ )
17
+ from fish_speech.utils.schema import ServeReferenceAudio
18
+
19
+
20
+ class ReferenceLoader:
21
+
22
+ def __init__(self) -> None:
23
+ """
24
+ Component of the TTSInferenceEngine class.
25
+ Loads and manages the cache for the reference audio and text.
26
+ """
27
+ self.ref_by_id: dict = {}
28
+ self.ref_by_hash: dict = {}
29
+
30
+ # Make Pylance happy (attribut/method not defined...)
31
+ self.decoder_model: DAC
32
+ self.encode_reference: Callable
33
+
34
+ # Define the torchaudio backend
35
+ backends = torchaudio.list_audio_backends()
36
+ if "ffmpeg" in backends:
37
+ self.backend = "ffmpeg"
38
+ else:
39
+ self.backend = "soundfile"
40
+
41
+ def load_by_id(
42
+ self,
43
+ id: str,
44
+ use_cache: Literal["on", "off"],
45
+ ) -> Tuple:
46
+
47
+ # Load the references audio and text by id
48
+ ref_folder = Path("references") / id
49
+ ref_folder.mkdir(parents=True, exist_ok=True)
50
+ ref_audios = list_files(
51
+ ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False
52
+ )
53
+
54
+ if use_cache == "off" or id not in self.ref_by_id:
55
+ # If the references are not already loaded, encode them
56
+ prompt_tokens = [
57
+ self.encode_reference(
58
+ # decoder_model=self.decoder_model,
59
+ reference_audio=audio_to_bytes(str(ref_audio)),
60
+ enable_reference_audio=True,
61
+ )
62
+ for ref_audio in ref_audios
63
+ ]
64
+ prompt_texts = [
65
+ read_ref_text(str(ref_audio.with_suffix(".lab")))
66
+ for ref_audio in ref_audios
67
+ ]
68
+ self.ref_by_id[id] = (prompt_tokens, prompt_texts)
69
+
70
+ else:
71
+ # Reuse already encoded references
72
+ logger.info("Use same references")
73
+ prompt_tokens, prompt_texts = self.ref_by_id[id]
74
+
75
+ return prompt_tokens, prompt_texts
76
+
77
+ def load_by_hash(
78
+ self,
79
+ references: list[ServeReferenceAudio],
80
+ use_cache: Literal["on", "off"],
81
+ ) -> Tuple:
82
+
83
+ # Load the references audio and text by hash
84
+ audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]
85
+
86
+ cache_used = False
87
+ prompt_tokens, prompt_texts = [], []
88
+ for i, ref in enumerate(references):
89
+ if use_cache == "off" or audio_hashes[i] not in self.ref_by_hash:
90
+ # If the references are not already loaded, encode them
91
+ prompt_tokens.append(
92
+ self.encode_reference(
93
+ reference_audio=ref.audio,
94
+ enable_reference_audio=True,
95
+ )
96
+ )
97
+ prompt_texts.append(ref.text)
98
+ self.ref_by_hash[audio_hashes[i]] = (prompt_tokens, prompt_texts)
99
+
100
+ else:
101
+ # Reuse already encoded references
102
+ prompt_tokens, prompt_texts = self.ref_by_hash[audio_hashes[i]]
103
+ cache_used = True
104
+
105
+ if cache_used:
106
+ logger.info("Use same references")
107
+
108
+ return prompt_tokens, prompt_texts
109
+
110
+ def load_audio(self, reference_audio, sr):
111
+ """
112
+ Load the audio data from a file or bytes.
113
+ """
114
+ if len(reference_audio) > 255 or not Path(reference_audio).exists():
115
+ audio_data = reference_audio
116
+ reference_audio = io.BytesIO(audio_data)
117
+
118
+ waveform, original_sr = torchaudio.load(reference_audio, backend=self.backend)
119
+
120
+ if waveform.shape[0] > 1:
121
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
122
+
123
+ if original_sr != sr:
124
+ resampler = torchaudio.transforms.Resample(
125
+ orig_freq=original_sr, new_freq=sr
126
+ )
127
+ waveform = resampler(waveform)
128
+
129
+ audio = waveform.squeeze().numpy()
130
+ return audio
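
As load_by_id above implies, references are resolved from a references/<id> folder in which every audio file is paired with a .lab transcript sharing its stem; a layout along these lines is what the loader expects (names are illustrative only):

    references/
        my_speaker/
            sample_01.wav
            sample_01.lab   # plain-text transcript of sample_01.wav
            sample_02.wav
            sample_02.lab

load_by_hash follows the same encode-then-cache pattern, but keys the cache on the SHA-256 of the raw audio bytes instead of a folder id.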
fish_speech/inference_engine/utils.py ADDED
@@ -0,0 +1,29 @@
1
+ import io
2
+ import wave
3
+ from dataclasses import dataclass
4
+ from typing import Literal, Optional, Tuple
5
+
6
+ import numpy as np
7
+
8
+
9
+ @dataclass
10
+ class InferenceResult:
11
+ code: Literal["header", "segment", "error", "final"]
12
+ audio: Optional[Tuple[int, np.ndarray]]
13
+ error: Optional[Exception]
14
+
15
+
16
+ def wav_chunk_header(
17
+ sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1
18
+ ) -> bytes:
19
+ buffer = io.BytesIO()
20
+
21
+ with wave.open(buffer, "wb") as wav_file:
22
+ wav_file.setnchannels(channels)
23
+ wav_file.setsampwidth(bit_depth // 8)
24
+ wav_file.setframerate(sample_rate)
25
+
26
+ wav_header_bytes = buffer.getvalue()
27
+ buffer.close()
28
+
29
+ return wav_header_bytes
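A small sketch of how the header helper above can be used when streaming raw 16-bit PCM; the `pcm_int16` array is a hypothetical placeholder for generated audio.

    import numpy as np

    header = wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1)
    pcm_int16 = np.zeros(44100, dtype=np.int16)  # hypothetical one second of silence
    wav_bytes = header + pcm_int16.tobytes()  # WAV header first, then raw PCM frames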
fish_speech/inference_engine/vq_manager.py ADDED
@@ -0,0 +1,59 @@
1
+ from typing import Callable
2
+
3
+ import torch
4
+ from loguru import logger
5
+
6
+ from fish_speech.models.dac.modded_dac import DAC
7
+
8
+
9
+ class VQManager:
10
+
11
+ def __init__(self):
12
+ # Make Pylance happy (attribute/method not defined...)
13
+ self.decoder_model: DAC
14
+ self.load_audio: Callable
15
+
16
+ def decode_vq_tokens(self, codes):
17
+ feature_lengths = torch.tensor(
18
+ [codes.shape[1]], device=self.decoder_model.device
19
+ )
20
+ logger.info(f"VQ features: {codes.shape}")
21
+
22
+ if isinstance(self.decoder_model, DAC):
23
+ return self.decoder_model.decode(
24
+ indices=codes[None],
25
+ feature_lengths=feature_lengths,
26
+ )[0].squeeze()
27
+
28
+ raise ValueError(f"Unknown model type: {type(self.decoder_model)}")
29
+
30
+ def encode_reference(self, reference_audio, enable_reference_audio):
31
+ if enable_reference_audio and reference_audio is not None:
32
+ # Load audios, and prepare basic info here
33
+ if hasattr(self.decoder_model, "spec_transform"):
34
+ sample_rate = self.decoder_model.spec_transform.sample_rate
35
+ else:
36
+ sample_rate = self.decoder_model.sample_rate
37
+ reference_audio_content = self.load_audio(reference_audio, sample_rate)
38
+
39
+ audios = torch.from_numpy(reference_audio_content).to(
40
+ self.decoder_model.device
41
+ )[None, None, :]
42
+ audio_lengths = torch.tensor(
43
+ [audios.shape[2]], device=self.decoder_model.device, dtype=torch.long
44
+ )
45
+ logger.info(
46
+ f"Loaded audio with {audios.shape[2] / sample_rate:.2f} seconds"
47
+ )
48
+
49
+ # VQ Encoder
50
+ if isinstance(self.decoder_model, DAC):
51
+ prompt_tokens = self.decoder_model.encode(audios, audio_lengths)[0][0]
52
+ logger.info(f"Encoded prompt: {prompt_tokens.shape}")
53
+ else:
54
+ raise ValueError(f"Unknown model type: {type(self.decoder_model)}")
55
+ else:
56
+ prompt_tokens = None
57
+ logger.info("No reference audio provided")
58
+
59
+ return prompt_tokens
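Taken together, `encode_reference` and `decode_vq_tokens` form a round trip through the DAC codec. A minimal sketch, assuming `manager` is an instance of a class mixing in `VQManager` with `decoder_model` and `load_audio` set up (the `ref.wav` path is hypothetical):

    # Encode a reference clip to VQ codes, then reconstruct it.
    codes = manager.encode_reference(reference_audio="ref.wav", enable_reference_audio=True)
    if codes is not None:
        waveform = manager.decode_vq_tokens(codes)  # 1-D tensor of reconstructed samples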
fish_speech/models/dac/__init__.py ADDED
File without changes
fish_speech/models/dac/inference.py ADDED
@@ -0,0 +1,123 @@
1
+ from pathlib import Path
2
+
3
+ import click
4
+ import hydra
5
+ import numpy as np
6
+ import pyrootutils
7
+ import soundfile as sf
8
+ import torch
9
+ import torchaudio
10
+ from hydra import compose, initialize
11
+ from hydra.utils import instantiate
12
+ from loguru import logger
13
+ from omegaconf import OmegaConf
14
+
15
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
16
+
17
+ from fish_speech.utils.file import AUDIO_EXTENSIONS
18
+
19
+ # register eval resolver
20
+ OmegaConf.register_new_resolver("eval", eval)
21
+
22
+
23
+ def load_model(config_name, checkpoint_path, device="cuda"):
24
+ hydra.core.global_hydra.GlobalHydra.instance().clear()
25
+ with initialize(version_base="1.3", config_path="../../configs"):
26
+ cfg = compose(config_name=config_name)
27
+
28
+ model = instantiate(cfg)
29
+ state_dict = torch.load(
30
+ checkpoint_path, map_location=device, mmap=True, weights_only=True
31
+ )
32
+ if "state_dict" in state_dict:
33
+ state_dict = state_dict["state_dict"]
34
+
35
+ if any("generator" in k for k in state_dict):
36
+ state_dict = {
37
+ k.replace("generator.", ""): v
38
+ for k, v in state_dict.items()
39
+ if "generator." in k
40
+ }
41
+
42
+ result = model.load_state_dict(state_dict, strict=False, assign=True)
43
+ model.eval()
44
+ model.to(device)
45
+
46
+ logger.info(f"Loaded model: {result}")
47
+ return model
48
+
49
+
50
+ @torch.no_grad()
51
+ @click.command()
52
+ @click.option(
53
+ "--input-path",
54
+ "-i",
55
+ default="test.wav",
56
+ type=click.Path(exists=True, path_type=Path),
57
+ )
58
+ @click.option(
59
+ "--output-path", "-o", default="fake.wav", type=click.Path(path_type=Path)
60
+ )
61
+ @click.option("--config-name", default="modded_dac_vq")
62
+ @click.option(
63
+ "--checkpoint-path",
64
+ default="checkpoints/openaudio-s1-mini/codec.pth",
65
+ )
66
+ @click.option(
67
+ "--device",
68
+ "-d",
69
+ default="cuda",
70
+ )
71
+ def main(input_path, output_path, config_name, checkpoint_path, device):
72
+ model = load_model(config_name, checkpoint_path, device=device)
73
+
74
+ if input_path.suffix in AUDIO_EXTENSIONS:
75
+ logger.info(f"Processing in-place reconstruction of {input_path}")
76
+
77
+ # Load audio
78
+ audio, sr = torchaudio.load(str(input_path))
79
+ if audio.shape[0] > 1:
80
+ audio = audio.mean(0, keepdim=True)
81
+ audio = torchaudio.functional.resample(audio, sr, model.sample_rate)
82
+
83
+ audios = audio[None].to(device)
84
+ logger.info(
85
+ f"Loaded audio with {audios.shape[2] / model.sample_rate:.2f} seconds"
86
+ )
87
+
88
+ # VQ Encoder
89
+ audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)
90
+ indices, indices_lens = model.encode(audios, audio_lengths)
91
+
92
+ if indices.ndim == 3:
93
+ indices = indices[0]
94
+
95
+ logger.info(f"Generated indices of shape {indices.shape}")
96
+
97
+ # Save indices
98
+ np.save(output_path.with_suffix(".npy"), indices.cpu().numpy())
99
+ elif input_path.suffix == ".npy":
100
+ logger.info(f"Processing precomputed indices from {input_path}")
101
+ indices = np.load(input_path)
102
+ indices = torch.from_numpy(indices).to(device).long()
103
+ assert indices.ndim == 2, f"Expected 2D indices, got {indices.ndim}"
104
+ indices_lens = torch.tensor([indices.shape[1]], device=device, dtype=torch.long)
105
+ else:
106
+ raise ValueError(f"Unknown input type: {input_path}")
107
+
108
+ # Restore
109
+ fake_audios, audio_lengths = model.decode(indices, indices_lens)
110
+ audio_time = fake_audios.shape[-1] / model.sample_rate
111
+
112
+ logger.info(
113
+ f"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}"
114
+ )
115
+
116
+ # Save audio
117
+ fake_audio = fake_audios[0, 0].float().cpu().numpy()
118
+ sf.write(output_path, fake_audio, model.sample_rate)
119
+ logger.info(f"Saved audio to {output_path}")
120
+
121
+
122
+ if __name__ == "__main__":
123
+ main()
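With the defaults above, `main` can be invoked from the repository root as `python fish_speech/models/dac/inference.py -i test.wav -o fake.wav` (the input path is a placeholder); passing a precomputed `.npy` file of indices instead of a `.wav` skips the encoding step, as handled in `main` above.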
fish_speech/models/dac/modded_dac.py ADDED
@@ -0,0 +1,1024 @@
1
+ import math
2
+ import typing as tp
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Union
5
+
6
+ import hydra
7
+ import librosa
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import torch
11
+ from audiotools import AudioSignal
12
+ from audiotools.ml import BaseModel
13
+ from dac.model.base import CodecMixin
14
+ from dac.nn.layers import Snake1d, WNConv1d, WNConvTranspose1d
15
+ from omegaconf import OmegaConf
16
+ from torch import Tensor, nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.utils.parametrizations import weight_norm
19
+ from torch.nn.utils.parametrize import remove_parametrizations
20
+
21
+
22
+ @dataclass
23
+ class VQResult:
24
+ z: torch.Tensor
25
+ codes: torch.Tensor
26
+ latents: torch.Tensor
27
+ codebook_loss: torch.Tensor
28
+ commitment_loss: torch.Tensor
29
+ semantic_distill_z: torch.Tensor | None = None
30
+
31
+
32
+ def find_multiple(n: int, k: int) -> int:
33
+ if n % k == 0:
34
+ return n
35
+ return n + k - (n % k)
36
+
37
+
38
+ @dataclass
39
+ class ModelArgs:
40
+ block_size: int = 2048
41
+ n_layer: int = 8
42
+ n_head: int = 8
43
+ dim: int = 512
44
+ intermediate_size: int = 1536
45
+ n_local_heads: int = -1
46
+ head_dim: int = 64
47
+ rope_base: float = 10000
48
+ norm_eps: float = 1e-5
49
+ dropout_rate: float = 0.1
50
+ attn_dropout_rate: float = 0.1
51
+ channels_first: bool = True # to be compatible with conv1d input/output
52
+ pos_embed_type: str = "rope" # can be "rope" or "conformer"
53
+ max_relative_position: int = 128 # for conformer-style relative position embedding
54
+
55
+ def __post_init__(self):
56
+ if self.n_local_heads == -1:
57
+ self.n_local_heads = self.n_head
58
+ if self.intermediate_size is None:
59
+ hidden_dim = 4 * self.dim
60
+ n_hidden = int(2 * hidden_dim / 3)
61
+ self.intermediate_size = find_multiple(n_hidden, 256)
62
+ assert self.pos_embed_type in [
63
+ "rope",
64
+ "conformer",
65
+ ], "pos_embed_type must be either 'rope' or 'conformer'"
66
+
67
+
68
+ class KVCache(nn.Module):
69
+ def __init__(
70
+ self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=torch.bfloat16
71
+ ):
72
+ super().__init__()
73
+ cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)
74
+ self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
75
+ self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
76
+
77
+ def update(self, input_pos, k_val, v_val):
78
+ # input_pos: [S], k_val: [B, H, S, D]
79
+ assert input_pos.shape[0] == k_val.shape[2]
80
+
81
+ k_out = self.k_cache
82
+ v_out = self.v_cache
83
+ k_out[:, :, input_pos] = k_val
84
+ v_out[:, :, input_pos] = v_val
85
+
86
+ return (
87
+ k_out[:, :, : input_pos.max() + 1, :],
88
+ v_out[:, :, : input_pos.max() + 1, :],
89
+ )
90
+
91
+ def clear_cache(self, prompt_len):
92
+ self.k_cache[:, :, prompt_len:, :].fill_(0)
93
+ self.v_cache[:, :, prompt_len:, :].fill_(0)
94
+
95
+
96
+ class Transformer(nn.Module):
97
+ def __init__(self, config: ModelArgs) -> None:
98
+ super().__init__()
99
+ self.config = config
100
+
101
+ self.layers = nn.ModuleList(
102
+ TransformerBlock(config) for _ in range(config.n_layer)
103
+ )
104
+ self.norm = RMSNorm(config.dim, eps=config.norm_eps)
105
+
106
+ # Only compute RoPE frequencies if using RoPE
107
+ if config.pos_embed_type == "rope":
108
+ freqs_cis = precompute_freqs_cis(
109
+ self.config.block_size, self.config.head_dim, self.config.rope_base
110
+ )
111
+ self.register_buffer("freqs_cis", freqs_cis)
112
+ else:
113
+ self.register_buffer("freqs_cis", None)
114
+
115
+ causal_mask = torch.tril(
116
+ torch.ones(self.config.block_size, self.config.block_size, dtype=torch.bool)
117
+ )
118
+ self.register_buffer("causal_mask", causal_mask)
119
+
120
+ self.max_batch_size = -1
121
+ self.max_seq_length = -1
122
+ self.use_kv_cache = False
123
+
124
+ def setup_caches(self, max_batch_size, max_seq_length):
125
+ """
126
+ This method will only be called during inference when using KV cache.
127
+ """
128
+ head_dim = self.config.dim // self.config.n_head
129
+ max_seq_length = find_multiple(max_seq_length, 8)
130
+ self.max_seq_length = max_seq_length
131
+ self.max_batch_size = max_batch_size
132
+ dtype = self.norm.weight.dtype
133
+ device = self.norm.weight.device
134
+
135
+ for b in self.layers:
136
+ b.attention.kv_cache = KVCache(
137
+ max_batch_size,
138
+ max_seq_length,
139
+ self.config.n_local_heads,
140
+ head_dim,
141
+ dtype,
142
+ ).to(device)
143
+
144
+ self.use_kv_cache = True
145
+
146
+ def forward(
147
+ self,
148
+ x: Tensor,
149
+ input_pos: Optional[Tensor] = None,
150
+ mask: Optional[Tensor] = None,
151
+ ) -> Tensor:
152
+ if self.config.pos_embed_type == "rope":
153
+ assert (
154
+ self.freqs_cis is not None
155
+ ), "RoPE frequencies must be initialized for RoPE positional embedding"
156
+ freqs_cis = self.freqs_cis[input_pos]
157
+ else:
158
+ freqs_cis = None
159
+
160
+ if mask is None: # in case of non-causal model
161
+ if not self.training and self.use_kv_cache:
162
+ mask = self.causal_mask[None, None, input_pos]
163
+ mask = mask[..., : input_pos.max() + 1]
164
+ else:
165
+ mask = self.causal_mask[None, None, input_pos]
166
+ mask = mask[..., input_pos]
167
+
168
+ for i, layer in enumerate(self.layers):
169
+ x = layer(x, input_pos, freqs_cis, mask)
170
+ x = self.norm(x)
171
+ return x
172
+
173
+
174
+ class TransformerBlock(nn.Module):
175
+ def __init__(self, config: ModelArgs) -> None:
176
+ super().__init__()
177
+ self.attention = Attention(config)
178
+ self.feed_forward = FeedForward(config)
179
+ self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps)
180
+ self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps)
181
+ self.attention_layer_scale = LayerScale(config.dim, inplace=True)
182
+ self.ffn_layer_scale = LayerScale(config.dim, inplace=True)
183
+
184
+ def forward(
185
+ self,
186
+ x: Tensor,
187
+ input_pos: Tensor,
188
+ freqs_cis: Tensor,
189
+ mask: Tensor,
190
+ ) -> Tensor:
191
+ h = x + self.attention_layer_scale(
192
+ self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
193
+ )
194
+ out = h + self.ffn_layer_scale(self.feed_forward(self.ffn_norm(h)))
195
+ return out
196
+
197
+
198
+ class Attention(nn.Module):
199
+ def __init__(self, config: ModelArgs):
200
+ super().__init__()
201
+ assert config.dim % config.n_head == 0
202
+
203
+ total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
204
+ # key, query, value projections for all heads, but in a batch
205
+ self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
206
+ self.wo = nn.Linear(config.head_dim * config.n_head, config.dim, bias=False)
207
+ self.kv_cache = None
208
+
209
+ self.n_head = config.n_head
210
+ self.head_dim = config.head_dim
211
+ self.n_local_heads = config.n_local_heads
212
+ self.dim = config.dim
213
+ self.attn_dropout_rate = config.attn_dropout_rate
214
+ self.pos_embed_type = config.pos_embed_type
215
+
216
+ # Add relative position embedding for conformer-style
217
+ if self.pos_embed_type == "conformer":
218
+ self.max_relative_position = config.max_relative_position
219
+ num_pos_embeddings = 2 * config.max_relative_position + 1
220
+ self.rel_pos_embeddings = nn.Parameter(
221
+ torch.zeros(num_pos_embeddings, self.head_dim)
222
+ )
223
+ nn.init.normal_(self.rel_pos_embeddings, mean=0.0, std=0.02)
224
+
225
+ def _compute_conformer_pos_scores(self, q: Tensor, seqlen: int) -> Tensor:
226
+ # q: [B, H, S, D]
227
+ # Returns: [B, H, S, S]
228
+ positions = torch.arange(seqlen, device=q.device)
229
+ relative_positions = positions.unsqueeze(1) - positions.unsqueeze(0) # [S, S]
230
+ relative_positions = torch.clamp(
231
+ relative_positions + self.max_relative_position,
232
+ 0,
233
+ 2 * self.max_relative_position,
234
+ )
235
+ rel_embeddings = self.rel_pos_embeddings[relative_positions] # [S, S, D]
236
+
237
+ # Compute attention scores with relative position embeddings
238
+ q = q.transpose(1, 2) # [B, S, H, D]
239
+ rel_logits = torch.matmul(q, rel_embeddings.transpose(-2, -1)) # [B, S, H, S]
240
+ rel_logits = rel_logits.transpose(1, 2) # [B, H, S, S]
241
+ return rel_logits
242
+
243
+ def forward(
244
+ self,
245
+ x: Tensor,
246
+ freqs_cis: Tensor,
247
+ mask: Tensor,
248
+ input_pos: Optional[Tensor] = None,
249
+ ) -> Tensor:
250
+ bsz, seqlen, _ = x.shape
251
+
252
+ kv_size = self.n_local_heads * self.head_dim
253
+ q, k, v = self.wqkv(x).split([kv_size, kv_size, kv_size], dim=-1)
254
+ context_seqlen = seqlen
255
+
256
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
257
+ k = k.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)
258
+ v = v.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)
259
+
260
+ if self.pos_embed_type == "rope":
261
+ q = apply_rotary_emb(q, freqs_cis)
262
+ k = apply_rotary_emb(k, freqs_cis)
263
+
264
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
265
+
266
+ if self.kv_cache is not None:
267
+ k, v = self.kv_cache.update(input_pos, k, v)
268
+
269
+ k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
270
+ v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
271
+
272
+ if self.pos_embed_type == "conformer":
273
+ # Compute attention scores
274
+ scale = 1.0 / math.sqrt(self.head_dim)
275
+ scores = torch.matmul(q, k.transpose(-2, -1)) * scale
276
+
277
+ # Add relative position embeddings for conformer-style
278
+ rel_scores = self._compute_conformer_pos_scores(q, seqlen)
279
+ scores = scores + rel_scores
280
+
281
+ # Apply attention
282
+ if mask is not None:
283
+ scores = scores.masked_fill(~mask, float("-inf"))
284
+
285
+ attn = F.softmax(scores, dim=-1)
286
+ if self.attn_dropout_rate > 0 and self.training:
287
+ attn = F.dropout(attn, p=self.attn_dropout_rate)
288
+
289
+ y = torch.matmul(attn, v)
290
+ else:
291
+ y = F.scaled_dot_product_attention(
292
+ q,
293
+ k,
294
+ v,
295
+ dropout_p=self.attn_dropout_rate if self.training else 0.0,
296
+ attn_mask=mask,
297
+ )
298
+ # is_causal=True)
299
+ y = (
300
+ y.transpose(1, 2)
301
+ .contiguous()
302
+ .view(bsz, seqlen, self.head_dim * self.n_head)
303
+ )
304
+ y = self.wo(y)
305
+ return y
306
+
307
+
308
+ class FeedForward(nn.Module):
309
+ def __init__(self, config: ModelArgs) -> None:
310
+ super().__init__()
311
+ self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
312
+ self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
313
+ self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
314
+ self.dropout = nn.Dropout(config.dropout_rate)
315
+
316
+ def forward(self, x: Tensor) -> Tensor:
317
+ return self.w2(self.dropout(F.silu(self.w1(x)) * self.w3(x)))
318
+
319
+
320
+ class RMSNorm(nn.Module):
321
+ def __init__(self, dim: int, eps: float = 1e-5):
322
+ super().__init__()
323
+ self.eps = eps
324
+ self.weight = nn.Parameter(torch.ones(dim))
325
+
326
+ def _norm(self, x):
327
+ return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
328
+
329
+ def forward(self, x: Tensor) -> Tensor:
330
+ output = self._norm(x.float()).type_as(x)
331
+ return output * self.weight
332
+
333
+
334
+ class LayerScale(nn.Module):
335
+ def __init__(
336
+ self,
337
+ dim: int,
338
+ init_values: Union[float, Tensor] = 1e-2,
339
+ inplace: bool = False,
340
+ ) -> None:
341
+ super().__init__()
342
+ self.inplace = inplace
343
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
344
+
345
+ def forward(self, x: Tensor) -> Tensor:
346
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
347
+
348
+
349
+ class WindowLimitedTransformer(Transformer):
350
+ """
351
+ Transformer with window limited attention, causal.
352
+ """
353
+
354
+ def __init__(
355
+ self,
356
+ config: ModelArgs,
357
+ input_dim: int = 512,
358
+ window_size: Optional[int] = None,
359
+ causal: bool = True,
360
+ look_ahead_conv: nn.Module = None,
361
+ ):
362
+ super().__init__(config)
363
+ self.window_size = window_size
364
+ self.causal = causal
365
+ self.channels_first = config.channels_first
366
+ self.look_ahead_conv = (
367
+ look_ahead_conv if look_ahead_conv is not None else nn.Identity()
368
+ )
369
+ self.input_proj = (
370
+ nn.Linear(input_dim, config.dim)
371
+ if input_dim != config.dim
372
+ else nn.Identity()
373
+ )
374
+ self.output_proj = (
375
+ nn.Linear(config.dim, input_dim)
376
+ if input_dim != config.dim
377
+ else nn.Identity()
378
+ )
379
+
380
+ def make_window_limited_mask(
381
+ self,
382
+ max_length: int,
383
+ x_lens: Optional[Tensor] = None,
384
+ ) -> Tensor:
385
+ """
386
+ Make mask to form window limited attention.
387
+ """
388
+ if self.causal:
389
+ mask = torch.tril(torch.ones(max_length, max_length))
390
+ row_indices = torch.arange(max_length).view(-1, 1)
391
+ window_size = self.window_size or max_length
392
+ valid_range = (row_indices - window_size + 1).clamp(min=0)
393
+ column_indices = torch.arange(max_length)
394
+ mask = (column_indices >= valid_range) & mask.bool()
395
+ else:
396
+ raise NotImplementedError
397
+ mask = mask.bool()[None, None]
398
+ return mask
399
+
400
+ def make_mask(
401
+ self,
402
+ max_length: int,
403
+ x_lens: Optional[Tensor] = None,
404
+ ) -> Tensor:
405
+ """
406
+ Make ordinary mask if window size is not specified.
407
+ """
408
+ if self.causal:
409
+ mask = torch.tril(torch.ones(max_length, max_length))
410
+ else:
411
+ mask = torch.ones(max_length, max_length)
412
+ mask = mask.bool()[None, None]
413
+ for i, x_len in enumerate(x_lens):
414
+ mask[:x_len, i] = 0
415
+ mask = mask.bool()[None, None]
416
+ return mask
417
+
418
+ def forward(
419
+ self,
420
+ x: Tensor,
421
+ x_lens: Optional[Tensor] = None,
422
+ ) -> Tensor:
423
+ if self.channels_first:
424
+ x = x.transpose(1, 2)
425
+ x = self.input_proj(x) # (B, T, D)
426
+ x = self.look_ahead_conv(x)
427
+ input_pos = torch.arange(x.shape[1], device=x.device)
428
+ # construct mask to form window limited attention
429
+ max_length = x.shape[1]
430
+ if self.window_size is not None:
431
+ mask = self.make_window_limited_mask(max_length, x_lens)
432
+ else:
433
+ mask = self.make_mask(max_length, x_lens)
434
+ mask = mask.to(x.device)
435
+ x = super().forward(x, input_pos, mask)
436
+ x = self.output_proj(x) # (B, T, D)
437
+ if self.channels_first:
438
+ x = x.transpose(1, 2)
439
+ return x
440
+
441
+
442
+ def precompute_freqs_cis(
443
+ seq_len: int, n_elem: int, base: int = 10000, dtype: torch.dtype = torch.bfloat16
444
+ ) -> Tensor:
445
+ freqs = 1.0 / (
446
+ base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
447
+ )
448
+ t = torch.arange(seq_len, device=freqs.device)
449
+ freqs = torch.outer(t, freqs)
450
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
451
+ cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
452
+ return cache.to(dtype=dtype)
453
+
454
+
455
+ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
456
+ xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
457
+ freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
458
+ x_out2 = torch.stack(
459
+ [
460
+ xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
461
+ xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
462
+ ],
463
+ -1,
464
+ )
465
+
466
+ x_out2 = x_out2.flatten(3)
467
+ return x_out2.type_as(x)
468
+
469
+
470
+ def init_weights(m):
471
+ if isinstance(m, nn.Conv1d):
472
+ nn.init.trunc_normal_(m.weight, std=0.02)
473
+ nn.init.constant_(m.bias, 0)
474
+
475
+
476
+ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
477
+ """Remove padding from x, handling properly zero padding. Only for 1d!"""
478
+ padding_left, padding_right = paddings
479
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
480
+ assert (padding_left + padding_right) <= x.shape[-1]
481
+ end = x.shape[-1] - padding_right
482
+ return x[..., padding_left:end]
483
+
484
+
485
+ def get_extra_padding_for_conv1d(
486
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
487
+ ) -> int:
488
+ """See `pad_for_conv1d`."""
489
+ length = x.shape[-1]
490
+ n_frames = (length - kernel_size + padding_total) / stride + 1
491
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
492
+ return ideal_length - length
493
+
494
+
495
+ def pad1d(
496
+ x: torch.Tensor,
497
+ paddings: tp.Tuple[int, int],
498
+ mode: str = "zeros",
499
+ value: float = 0.0,
500
+ ):
501
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
502
+ If this is the case, we insert extra 0 padding to the right
503
+ before the reflection happen.
504
+ """
505
+ length = x.shape[-1]
506
+ padding_left, padding_right = paddings
507
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
508
+ if mode == "reflect":
509
+ max_pad = max(padding_left, padding_right)
510
+ extra_pad = 0
511
+ if length <= max_pad:
512
+ extra_pad = max_pad - length + 1
513
+ x = F.pad(x, (0, extra_pad))
514
+ padded = F.pad(x, paddings, mode, value)
515
+ end = padded.shape[-1] - extra_pad
516
+ return padded[..., :end]
517
+ else:
518
+ return F.pad(x, paddings, mode, value)
519
+
520
+
521
+ class CausalConvNet(nn.Module):
522
+ def __init__(
523
+ self,
524
+ in_channels,
525
+ out_channels,
526
+ kernel_size,
527
+ dilation=1,
528
+ stride=1,
529
+ groups=1,
530
+ padding=None,
531
+ ):
532
+ super(CausalConvNet, self).__init__()
533
+ self.conv = nn.Conv1d(
534
+ in_channels,
535
+ out_channels,
536
+ kernel_size,
537
+ stride=stride,
538
+ dilation=dilation,
539
+ groups=groups,
540
+ )
541
+ self.stride = stride
542
+ self.kernel_size = (kernel_size - 1) * dilation + 1
543
+ self.dilation = dilation
544
+ self.padding = self.kernel_size - self.stride
545
+
546
+ def forward(self, x):
547
+ pad = self.padding
548
+ extra_padding = get_extra_padding_for_conv1d(
549
+ x, self.kernel_size, self.stride, pad
550
+ )
551
+ x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
552
+ return self.conv(x).contiguous()
553
+
554
+ def weight_norm(self, name="weight", dim=0):
555
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
556
+ return self
557
+
558
+ def remove_weight_norm(self):
559
+ self.conv = remove_parametrizations(self.conv)
560
+ return self
561
+
562
+
563
+ class CausalTransConvNet(nn.Module):
564
+ def __init__(
565
+ self, in_channels, out_channels, kernel_size, dilation=1, stride=1, padding=None
566
+ ):
567
+ super(CausalTransConvNet, self).__init__()
568
+ self.conv = nn.ConvTranspose1d(
569
+ in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
570
+ )
571
+ self.stride = stride
572
+ self.kernel_size = kernel_size
573
+
574
+ def forward(self, x):
575
+ x = self.conv(x)
576
+ pad = self.kernel_size - self.stride
577
+ padding_right = math.ceil(pad)
578
+ padding_left = pad - padding_right
579
+ x = unpad1d(x, (padding_left, padding_right))
580
+ return x.contiguous()
581
+
582
+ def weight_norm(self, name="weight", dim=0):
583
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
584
+ return self
585
+
586
+ def remove_weight_norm(self):
587
+ self.conv = remove_parametrizations(self.conv)
588
+ return self
589
+
590
+
591
+ def CausalWNConv1d(*args, **kwargs):
592
+ return CausalConvNet(*args, **kwargs).weight_norm()
593
+
594
+
595
+ def CausalWNConvTranspose1d(*args, **kwargs):
596
+ return CausalTransConvNet(*args, **kwargs).weight_norm()
597
+
598
+
599
+ class ResidualUnit(nn.Module):
600
+ def __init__(self, dim: int = 16, dilation: int = 1, causal: bool = False):
601
+ super().__init__()
602
+ conv_class = CausalWNConv1d if causal else WNConv1d
603
+ pad = ((7 - 1) * dilation) // 2
604
+ self.block = nn.Sequential(
605
+ Snake1d(dim),
606
+ conv_class(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
607
+ Snake1d(dim),
608
+ conv_class(dim, dim, kernel_size=1),
609
+ )
610
+ self.causal = causal
611
+
612
+ def forward(self, x):
613
+ y = self.block(x)
614
+ pad = x.shape[-1] - y.shape[-1]
615
+ if pad > 0:
616
+ if self.causal:
617
+ x = x[..., :-pad]
618
+ else:
619
+ x = x[..., pad // 2 : -pad // 2]
620
+ return x + y
621
+
622
+
623
+ class EncoderBlock(nn.Module):
624
+ def __init__(
625
+ self,
626
+ dim: int = 16,
627
+ stride: int = 1,
628
+ causal: bool = False,
629
+ n_t_layer: int = 0,
630
+ transformer_general_config=None,
631
+ ):
632
+ super().__init__()
633
+ conv_class = CausalWNConv1d if causal else WNConv1d
634
+ transformer_module = (
635
+ nn.Identity()
636
+ if n_t_layer == 0
637
+ else (
638
+ WindowLimitedTransformer(
639
+ causal=causal,
640
+ input_dim=dim,
641
+ window_size=512,
642
+ config=transformer_general_config(
643
+ n_layer=n_t_layer,
644
+ n_head=dim // 64,
645
+ dim=dim,
646
+ intermediate_size=dim * 3,
647
+ ),
648
+ )
649
+ )
650
+ )
651
+ self.block = nn.Sequential(
652
+ ResidualUnit(dim // 2, dilation=1, causal=causal),
653
+ ResidualUnit(dim // 2, dilation=3, causal=causal),
654
+ ResidualUnit(dim // 2, dilation=9, causal=causal),
655
+ Snake1d(dim // 2),
656
+ conv_class(
657
+ dim // 2,
658
+ dim,
659
+ kernel_size=2 * stride,
660
+ stride=stride,
661
+ padding=math.ceil(stride / 2),
662
+ ),
663
+ transformer_module,
664
+ )
665
+
666
+ def forward(self, x):
667
+ return self.block(x)
668
+
669
+
670
+ class Encoder(nn.Module):
671
+ def __init__(
672
+ self,
673
+ d_model: int = 64,
674
+ strides: list = [2, 4, 8, 8],
675
+ d_latent: int = 64,
676
+ n_transformer_layers: list = [0, 0, 4, 4],
677
+ transformer_general_config: ModelArgs = None,
678
+ causal: bool = False,
679
+ ):
680
+ super().__init__()
681
+ conv_class = CausalWNConv1d if causal else WNConv1d
682
+ # Create first convolution
683
+ self.block = [conv_class(1, d_model, kernel_size=7, padding=3)]
684
+
685
+ # Create EncoderBlocks that double channels as they downsample by `stride`
686
+ for stride, n_t_layer in zip(strides, n_transformer_layers):
687
+ d_model *= 2
688
+ self.block += [
689
+ EncoderBlock(
690
+ d_model,
691
+ stride=stride,
692
+ causal=causal,
693
+ n_t_layer=n_t_layer,
694
+ transformer_general_config=transformer_general_config,
695
+ )
696
+ ]
697
+
698
+ # Create last convolution
699
+ self.block += [
700
+ Snake1d(d_model),
701
+ conv_class(d_model, d_latent, kernel_size=3, padding=1),
702
+ ]
703
+
704
+ # Wrap block list into nn.Sequential
705
+ self.block = nn.Sequential(*self.block)
706
+ self.enc_dim = d_model
707
+
708
+ def forward(self, x):
709
+ return self.block(x)
710
+
711
+
712
+ class DecoderBlock(nn.Module):
713
+ def __init__(
714
+ self,
715
+ input_dim: int = 16,
716
+ output_dim: int = 8,
717
+ stride: int = 1,
718
+ causal: bool = False,
719
+ n_t_layer: int = 0,
720
+ transformer_general_config=None,
721
+ ):
722
+ super().__init__()
723
+ conv_trans_class = CausalWNConvTranspose1d if causal else WNConvTranspose1d
724
+ transformer_module = (
725
+ nn.Identity()
726
+ if n_t_layer == 0
727
+ else (
728
+ WindowLimitedTransformer(
729
+ causal=causal,
730
+ input_dim=input_dim,
731
+ window_size=None,
732
+ config=transformer_general_config(
733
+ n_layer=n_t_layer,
734
+ n_head=input_dim // 64,
735
+ dim=input_dim,
736
+ intermediate_size=input_dim * 3,
737
+ ),
738
+ )
739
+ )
740
+ )
741
+ self.block = nn.Sequential(
742
+ # transformer_module,
743
+ Snake1d(input_dim),
744
+ conv_trans_class(
745
+ input_dim,
746
+ output_dim,
747
+ kernel_size=2 * stride,
748
+ stride=stride,
749
+ padding=math.ceil(stride / 2),
750
+ ),
751
+ ResidualUnit(output_dim, dilation=1, causal=causal),
752
+ ResidualUnit(output_dim, dilation=3, causal=causal),
753
+ ResidualUnit(output_dim, dilation=9, causal=causal),
754
+ )
755
+
756
+ def forward(self, x):
757
+ return self.block(x)
758
+
759
+
760
+ class Decoder(nn.Module):
761
+ def __init__(
762
+ self,
763
+ input_channel,
764
+ channels,
765
+ rates,
766
+ d_out: int = 1,
767
+ causal: bool = False,
768
+ n_transformer_layers: list = [0, 0, 0, 0],
769
+ transformer_general_config=None,
770
+ ):
771
+ super().__init__()
772
+ conv_class = CausalWNConv1d if causal else WNConv1d
773
+ # Add first conv layer
774
+ layers = [conv_class(input_channel, channels, kernel_size=7, padding=3)]
775
+
776
+ # Add upsampling + MRF blocks
777
+ for i, (stride, n_t_layer) in enumerate(zip(rates, n_transformer_layers)):
778
+ input_dim = channels // 2**i
779
+ output_dim = channels // 2 ** (i + 1)
780
+ layers += [
781
+ DecoderBlock(
782
+ input_dim,
783
+ output_dim,
784
+ stride,
785
+ causal=causal,
786
+ n_t_layer=n_t_layer,
787
+ transformer_general_config=transformer_general_config,
788
+ )
789
+ ]
790
+
791
+ # Add final conv layer
792
+ layers += [
793
+ Snake1d(output_dim),
794
+ conv_class(output_dim, d_out, kernel_size=7, padding=3),
795
+ nn.Tanh(),
796
+ ]
797
+
798
+ self.model = nn.Sequential(*layers)
799
+
800
+ def forward(self, x):
801
+ return self.model(x)
802
+
803
+
804
+ class DAC(BaseModel, CodecMixin):
805
+ def __init__(
806
+ self,
807
+ encoder_dim: int = 64,
808
+ encoder_rates: List[int] = [2, 4, 8, 8],
809
+ latent_dim: int = None,
810
+ decoder_dim: int = 1536,
811
+ decoder_rates: List[int] = [8, 8, 4, 2],
812
+ quantizer: torch.nn.Module = None,
813
+ sample_rate: int = 44100,
814
+ causal: bool = True,
815
+ encoder_transformer_layers: List[int] = [0, 0, 0, 0],
816
+ decoder_transformer_layers: List[int] = [0, 0, 0, 0],
817
+ transformer_general_config=None,
818
+ ):
819
+ super().__init__()
820
+
821
+ self.encoder_dim = encoder_dim
822
+ self.encoder_rates = encoder_rates
823
+ self.decoder_dim = decoder_dim
824
+ self.decoder_rates = decoder_rates
825
+ self.sample_rate = sample_rate
826
+
827
+ if latent_dim is None:
828
+ latent_dim = encoder_dim * (2 ** len(encoder_rates))
829
+
830
+ self.latent_dim = latent_dim
831
+
832
+ self.hop_length = np.prod(encoder_rates)
833
+ self.encoder = Encoder(
834
+ encoder_dim,
835
+ encoder_rates,
836
+ latent_dim,
837
+ causal=causal,
838
+ n_transformer_layers=encoder_transformer_layers,
839
+ transformer_general_config=transformer_general_config,
840
+ )
841
+
842
+ self.quantizer = quantizer
843
+
844
+ self.decoder = Decoder(
845
+ latent_dim,
846
+ decoder_dim,
847
+ decoder_rates,
848
+ causal=causal,
849
+ n_transformer_layers=decoder_transformer_layers,
850
+ transformer_general_config=transformer_general_config,
851
+ )
852
+ self.sample_rate = sample_rate
853
+ self.apply(init_weights)
854
+
855
+ self.delay = self.get_delay()
856
+
857
+ self.frame_length = self.hop_length * 4
858
+
859
+ def preprocess(self, audio_data, sample_rate):
860
+ if sample_rate is None:
861
+ sample_rate = self.sample_rate
862
+ assert sample_rate == self.sample_rate
863
+
864
+ length = audio_data.shape[-1]
865
+ right_pad = math.ceil(length / self.hop_length) * self.hop_length - length
866
+ audio_data = nn.functional.pad(audio_data, (0, right_pad))
867
+
868
+ return audio_data
869
+
870
+ def encode(
871
+ self,
872
+ audio_data: torch.Tensor,
873
+ audio_lengths: torch.Tensor = None,
874
+ n_quantizers: int = None,
875
+ **kwargs,
876
+ ):
877
+ """Encode given audio data and return quantized latent codes
878
+
879
+ Parameters
880
+ ----------
881
+ audio_data : Tensor[B x T]
882
+ Audio data to encode
883
+ n_quantizers : int, optional
884
+ Number of quantizers to use, by default None
885
+ If None, all quantizers are used.
886
+
887
+ Returns
888
+ -------
889
+ dict
890
+ A dictionary with the following keys:
891
+ "z" : Tensor[B x D x T]
892
+ Quantized continuous representation of input
893
+ "codes" : Tensor[B x N x T]
894
+ Codebook indices for each codebook
895
+ (quantized discrete representation of input)
896
+ "latents" : Tensor[B x N*D x T]
897
+ Projected latents (continuous representation of input before quantization)
898
+ "vq/commitment_loss" : Tensor[1]
899
+ Commitment loss to train encoder to predict vectors closer to codebook
900
+ entries
901
+ "vq/codebook_loss" : Tensor[1]
902
+ Codebook loss to update the codebook
903
+ "length" : int
904
+ Number of samples in input audio
905
+ """
906
+ # pad to multiple of self.frame_length
907
+ if audio_data.ndim == 2:
908
+ audio_data = audio_data.unsqueeze(1)
909
+ # print(audio_data.shape)
910
+ length = audio_data.shape[-1]
911
+ right_pad = math.ceil(length / self.frame_length) * self.frame_length - length
912
+ audio_data = nn.functional.pad(audio_data, (0, right_pad))
913
+ if audio_lengths is None:
914
+ audio_lengths = torch.LongTensor([length + right_pad]).to(audio_data.device)
915
+
916
+ z = self.encoder(audio_data)
917
+ vq_results = self.quantizer(z, n_quantizers, **kwargs)
918
+ indices = vq_results.codes
919
+ indices_lens = torch.ceil(audio_lengths / self.frame_length).long()
920
+ return indices, indices_lens
921
+
922
+ def decode(self, indices: torch.Tensor, feature_lengths):
923
+ if indices.ndim == 2:
924
+ indices = indices[None]
925
+
926
+ z = self.quantizer.decode(indices)
927
+ audio_lengths = feature_lengths * self.frame_length
928
+ return self.decoder(z), audio_lengths
929
+
930
+ def forward(
931
+ self,
932
+ audio_data: torch.Tensor,
933
+ template: torch.Tensor = None,
934
+ mask: torch.Tensor = None,
935
+ sample_rate: int = None,
936
+ n_quantizers: int = None,
937
+ **kwargs,
938
+ ):
939
+ """Model forward pass
940
+
941
+ Parameters
942
+ ----------
943
+ audio_data : Tensor[B x 1 x T]
944
+ Audio data to encode
945
+ sample_rate : int, optional
946
+ Sample rate of audio data in Hz, by default None
947
+ If None, defaults to `self.sample_rate`
948
+ n_quantizers : int, optional
949
+ Number of quantizers to use, by default None.
950
+ If None, all quantizers are used.
951
+
952
+ Returns
953
+ -------
954
+ dict
955
+ A dictionary with the following keys:
956
+ "z" : Tensor[B x D x T]
957
+ Quantized continuous representation of input
958
+ "codes" : Tensor[B x N x T]
959
+ Codebook indices for each codebook
960
+ (quantized discrete representation of input)
961
+ "latents" : Tensor[B x N*D x T]
962
+ Projected latents (continuous representation of input before quantization)
963
+ "vq/commitment_loss" : Tensor[1]
964
+ Commitment loss to train encoder to predict vectors closer to codebook
965
+ entries
966
+ "vq/codebook_loss" : Tensor[1]
967
+ Codebook loss to update the codebook
968
+ "length" : int
969
+ Number of samples in input audio
970
+ "audio" : Tensor[B x 1 x length]
971
+ Decoded audio data.
972
+ """
973
+ length = audio_data.shape[-1]
974
+ audio_data = self.preprocess(audio_data, sample_rate)
975
+ vq_results = self.encode(audio_data, n_quantizers, **kwargs)
976
+ z = vq_results[0] if isinstance(vq_results, tuple) else vq_results.z
977
+ x = self.decode(z)
978
+ return x[..., :length], vq_results
979
+
980
+
981
+ if __name__ == "__main__":
982
+
983
+ def filter_state_dict_shapes(params, model):
984
+ model_state_dict = model.state_dict()
985
+ filtered_state_dict = {
986
+ k: v
987
+ for k, v in params.items()
988
+ if k in model_state_dict and v.shape == model_state_dict[k].shape
989
+ }
990
+ skipped_keys = set(params.keys()) - set(filtered_state_dict.keys())
991
+ if skipped_keys:
992
+ print(
993
+ f"Warning: Skipped loading some keys due to shape mismatch: {skipped_keys}"
994
+ )
995
+ return filtered_state_dict, skipped_keys
996
+
997
+ model = hydra.utils.instantiate(
998
+ OmegaConf.load("fish_speech/configs/modded_dac_vq.yaml")
999
+ )
1000
+ sd = torch.load("checkpoints/openaudio-s1-mini/firefly-gan-large.pth")
1001
+ filtered_sd, skipped_keys = filter_state_dict_shapes(sd, model)
1002
+ print(f"Skipped keys: {skipped_keys}")
1003
+ model.load_state_dict(filtered_sd, strict=False)
1004
+ model.eval()
1005
+
1006
+ src_audio_path = "./test.wav"
1007
+ wave_np, _ = librosa.load(src_audio_path, sr=44100, mono=False)
1008
+ if len(wave_np.shape) == 1:
1009
+ wave_np = wave_np[None, :]
1010
+ wave_tensor = torch.from_numpy(wave_np).unsqueeze(1)
1011
+
1012
+ with torch.no_grad():
1013
+ # encode returns (indices, indices_lens)
1014
+ indices, indices_lens = model.encode(wave_tensor)
1015
+ print(f"Indices shape: {indices.shape}")
1016
+ print(f"Indices lengths: {indices_lens}")
1017
+
1018
+ # decode needs both indices and feature_lengths
1019
+ fake_audio, audio_lengths = model.decode(indices, indices_lens)
1020
+ print(f"Decoded audio shape: {fake_audio.shape}")
1021
+ print(f"Audio lengths: {audio_lengths}")
1022
+
1023
+ # Save the reconstructed audio
1024
+ sf.write("fake.wav", fake_audio.squeeze(1).cpu().numpy().T, 44100)
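A back-of-the-envelope check on the defaults above (not part of the diff; the extra 4x factor is assumed to correspond to the quantizer's (2, 2) downsampling):

    hop_length = 2 * 4 * 8 * 8  # prod(encoder_rates) = 512 samples per encoder frame
    frame_length = hop_length * 4  # 2048 samples per code frame, as set in DAC.__init__
    frames_per_second = 44100 / frame_length  # ~21.5 code frames per second at 44.1 kHz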
fish_speech/models/dac/rvq.py ADDED
@@ -0,0 +1,403 @@
1
+ import math
2
+ import typing as tp
3
+ from dataclasses import dataclass
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from dac.nn.quantize import ResidualVectorQuantize
9
+ from torch.nn.utils.parametrizations import weight_norm
10
+ from torch.nn.utils.parametrize import remove_parametrizations
11
+
12
+
13
+ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
14
+ """Remove padding from x, handling properly zero padding. Only for 1d!"""
15
+ padding_left, padding_right = paddings
16
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
17
+ assert (padding_left + padding_right) <= x.shape[-1]
18
+ end = x.shape[-1] - padding_right
19
+ return x[..., padding_left:end]
20
+
21
+
22
+ def get_extra_padding_for_conv1d(
23
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
24
+ ) -> int:
25
+ """See `pad_for_conv1d`."""
26
+ length = x.shape[-1]
27
+ n_frames = (length - kernel_size + padding_total) / stride + 1
28
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
29
+ return ideal_length - length
30
+
31
+
32
+ def pad1d(
33
+ x: torch.Tensor,
34
+ paddings: tp.Tuple[int, int],
35
+ mode: str = "zeros",
36
+ value: float = 0.0,
37
+ ):
38
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
39
+ If this is the case, we insert extra 0 padding to the right
40
+ before the reflection happen.
41
+ """
42
+ length = x.shape[-1]
43
+ padding_left, padding_right = paddings
44
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
45
+ if mode == "reflect":
46
+ max_pad = max(padding_left, padding_right)
47
+ extra_pad = 0
48
+ if length <= max_pad:
49
+ extra_pad = max_pad - length + 1
50
+ x = F.pad(x, (0, extra_pad))
51
+ padded = F.pad(x, paddings, mode, value)
52
+ end = padded.shape[-1] - extra_pad
53
+ return padded[..., :end]
54
+ else:
55
+ return F.pad(x, paddings, mode, value)
56
+
57
+
58
+ class CausalConvNet(nn.Module):
59
+ def __init__(
60
+ self,
61
+ in_channels,
62
+ out_channels,
63
+ kernel_size,
64
+ dilation=1,
65
+ stride=1,
66
+ groups=1,
67
+ padding=None,
68
+ ):
69
+ super(CausalConvNet, self).__init__()
70
+ self.conv = nn.Conv1d(
71
+ in_channels,
72
+ out_channels,
73
+ kernel_size,
74
+ stride=stride,
75
+ dilation=dilation,
76
+ groups=groups,
77
+ )
78
+ self.stride = stride
79
+ self.kernel_size = (kernel_size - 1) * dilation + 1
80
+ self.dilation = dilation
81
+ self.padding = self.kernel_size - self.stride
82
+
83
+ def forward(self, x):
84
+ pad = self.padding
85
+ extra_padding = get_extra_padding_for_conv1d(
86
+ x, self.kernel_size, self.stride, pad
87
+ )
88
+ x = pad1d(x, (pad, extra_padding), mode="constant", value=0)
89
+ return self.conv(x).contiguous()
90
+
91
+ def weight_norm(self, name="weight", dim=0):
92
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
93
+ return self
94
+
95
+ def remove_weight_norm(self):
96
+ self.conv = remove_parametrizations(self.conv)
97
+ return self
98
+
99
+
100
+ class CausalTransConvNet(nn.Module):
101
+ def __init__(
102
+ self, in_channels, out_channels, kernel_size, dilation=1, stride=1, padding=None
103
+ ):
104
+ super(CausalTransConvNet, self).__init__()
105
+ self.conv = nn.ConvTranspose1d(
106
+ in_channels, out_channels, kernel_size, stride=stride, dilation=dilation
107
+ )
108
+ self.stride = stride
109
+ self.kernel_size = kernel_size
110
+
111
+ def forward(self, x):
112
+ x = self.conv(x)
113
+ pad = self.kernel_size - self.stride
114
+ padding_right = math.ceil(pad)
115
+ padding_left = pad - padding_right
116
+ x = unpad1d(x, (padding_left, padding_right))
117
+ return x.contiguous()
118
+
119
+ def weight_norm(self, name="weight", dim=0):
120
+ self.conv = weight_norm(self.conv, name=name, dim=dim)
121
+ return self
122
+
123
+ def remove_weight_norm(self):
124
+ self.conv = remove_parametrizations(self.conv)
125
+ return self
126
+
127
+
128
+ # ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
129
+ class ConvNeXtBlock(nn.Module):
130
+ r"""ConvNeXt Block. There are two equivalent implementations:
131
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
132
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
133
+ We use (2) as we find it slightly faster in PyTorch
134
+ Args:
135
+ dim (int): Number of input channels.
136
+ drop_path (float): Stochastic depth rate. Default: 0.0
137
+ layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
138
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
139
+ kernel_size (int): Kernel size for depthwise conv. Default: 7.
140
+ dilation (int): Dilation for depthwise conv. Default: 1.
141
+ """ # noqa: E501
142
+
143
+ def __init__(
144
+ self,
145
+ dim: int,
146
+ layer_scale_init_value: float = 1e-6,
147
+ mlp_ratio: float = 4.0,
148
+ kernel_size: int = 7,
149
+ dilation: int = 1,
150
+ ):
151
+ super().__init__()
152
+ convnet_type = CausalConvNet
153
+ self.dwconv = convnet_type(
154
+ dim,
155
+ dim,
156
+ kernel_size=kernel_size,
157
+ # padding=int(dilation * (kernel_size - 1) / 2),
158
+ groups=dim,
159
+ dilation=dilation,
160
+ ) # depthwise conv
161
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
162
+ self.pwconv1 = nn.Linear(
163
+ dim, int(mlp_ratio * dim)
164
+ ) # pointwise/1x1 convs, implemented with linear layers
165
+ self.act = nn.GELU()
166
+ self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
167
+ self.gamma = (
168
+ nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
169
+ if layer_scale_init_value > 0
170
+ else None
171
+ )
172
+
173
+ def forward(self, x, apply_residual: bool = True):
174
+ input = x
175
+
176
+ x = self.dwconv(x)
177
+ x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
178
+ x = self.norm(x)
179
+ x = self.pwconv1(x)
180
+ x = self.act(x)
181
+ x = self.pwconv2(x)
182
+
183
+ if self.gamma is not None:
184
+ x = self.gamma * x
185
+
186
+ x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
187
+
188
+ if apply_residual:
189
+ x = input + x
190
+
191
+ return x
192
+
193
+
194
+ @dataclass
195
+ class VQResult:
196
+ z: torch.Tensor
197
+ codes: torch.Tensor
198
+ latents: torch.Tensor
199
+ codebook_loss: torch.Tensor
200
+ commitment_loss: torch.Tensor
201
+ semantic_distill_z: torch.Tensor | None = None
202
+
203
+
204
+ class DownsampleResidualVectorQuantize(nn.Module):
205
+ def __init__(
206
+ self,
207
+ input_dim: int = 1024,
208
+ n_codebooks: int = 9,
209
+ codebook_dim: int = 8,
210
+ quantizer_dropout: float = 0.5,
211
+ codebook_size: int = 1024,
212
+ semantic_codebook_size: int = 4096,
213
+ downsample_factor: tuple[int] = (2, 2),
214
+ downsample_dims: tuple[int] | None = None,
215
+ pre_module: nn.Module | None = None,
216
+ post_module: nn.Module | None = None,
217
+ semantic_predictor_module: nn.Module | None = None,
218
+ ):
219
+ super().__init__()
220
+
221
+ if downsample_dims is None:
222
+ downsample_dims = [input_dim for _ in range(len(downsample_factor))]
223
+
224
+ all_dims = (input_dim,) + tuple(downsample_dims)
225
+
226
+ self.semantic_quantizer = ResidualVectorQuantize(
227
+ input_dim=input_dim,
228
+ n_codebooks=1,
229
+ codebook_size=semantic_codebook_size,
230
+ codebook_dim=codebook_dim,
231
+ quantizer_dropout=0.0,
232
+ )
233
+
234
+ self.quantizer = ResidualVectorQuantize(
235
+ input_dim=input_dim,
236
+ n_codebooks=n_codebooks,
237
+ codebook_size=codebook_size,
238
+ codebook_dim=codebook_dim,
239
+ quantizer_dropout=quantizer_dropout,
240
+ )
241
+
242
+ self.downsample_factor = downsample_factor
243
+ self.downsample_dims = downsample_dims
244
+
245
+ convnet_type = CausalConvNet
246
+ transconvnet_type = CausalTransConvNet
247
+
248
+ self.downsample = nn.Sequential(
249
+ *[
250
+ nn.Sequential(
251
+ convnet_type(
252
+ all_dims[idx],
253
+ all_dims[idx + 1],
254
+ kernel_size=factor,
255
+ stride=factor,
256
+ ),
257
+ ConvNeXtBlock(dim=all_dims[idx + 1]),
258
+ )
259
+ for idx, factor in enumerate(downsample_factor)
260
+ ]
261
+ )
262
+
263
+ self.upsample = nn.Sequential(
264
+ *[
265
+ nn.Sequential(
266
+ transconvnet_type(
267
+ all_dims[idx + 1],
268
+ all_dims[idx],
269
+ kernel_size=factor,
270
+ stride=factor,
271
+ ),
272
+ ConvNeXtBlock(dim=all_dims[idx]),
273
+ )
274
+ for idx, factor in reversed(list(enumerate(downsample_factor)))
275
+ ]
276
+ )
277
+ self.apply(self._init_weights)
278
+ self.pre_module = (
279
+ pre_module if pre_module is not None else nn.Identity()
280
+ ) # leave for transformer, LSTM or Mamba or something else
281
+ self.post_module = post_module if post_module is not None else nn.Identity()
282
+ self.semantic_predictor_module = (
283
+ semantic_predictor_module
284
+ if semantic_predictor_module is not None
285
+ else nn.Identity()
286
+ )
287
+
288
+ def _init_weights(self, m):
289
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
290
+ nn.init.trunc_normal_(m.weight, std=0.02)
291
+ nn.init.constant_(m.bias, 0)
292
+
293
+ def forward(
294
+ self, z, n_quantizers: int = None, semantic_len: torch.Tensor = None, **kwargs
295
+ ):
296
+ # z: (B, D, T)
297
+ original_shape = z.shape
298
+ if semantic_len is None:
299
+ semantic_len = torch.LongTensor([z.shape[-1]])
300
+ z = self.downsample(z)
301
+ z = self.pre_module(z) # B, T, D
302
+ (
303
+ semantic_z,
304
+ semantic_codes,
305
+ semantic_latents,
306
+ semantic_commitment_loss,
307
+ semantic_codebook_loss,
308
+ ) = self.semantic_quantizer(z)
309
+ residual_z = z - semantic_z
310
+ residual_z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
311
+ residual_z, n_quantizers=n_quantizers
312
+ )
313
+ z = semantic_z + residual_z
314
+ commitment_loss = commitment_loss + semantic_commitment_loss
315
+ codebook_loss = codebook_loss + semantic_codebook_loss
316
+ codes = torch.cat([semantic_codes, codes], dim=1)
317
+ latents = torch.cat([semantic_latents, latents], dim=1)
318
+ z = self.post_module(z)
319
+ z = self.upsample(z)
320
+ # z: (B, D, T)
321
+
322
+ # semantic distillation (disabled here since only used in training)
323
+ # semantic_distill_z = self.semantic_predictor_module(semantic_z, semantic_len).mT # wav2vec target is B, T, D
324
+
325
+ # Pad or crop z to match original shape
326
+ diff = original_shape[-1] - z.shape[-1]
327
+ right = 0
328
+ left = abs(diff) - right
329
+
330
+ if diff > 0:
331
+ z = F.pad(z, (left, right))
332
+ elif diff < 0:
333
+ z = z[..., left:]
334
+
335
+ results = VQResult(
336
+ z=z,
337
+ codes=codes,
338
+ latents=latents,
339
+ commitment_loss=commitment_loss,
340
+ codebook_loss=codebook_loss,
341
+ )
342
+
343
+ return results
344
+
345
+ # def encode(self, z):
346
+ # z = self.downsample(z)
347
+ # z = self.pre_module(z)
348
+ # _, indices, _, _, _ = self.quantizer(z.mT)
349
+ # indices = rearrange(indices, "g b l r -> b (g r) l")
350
+ # return indices
351
+ #
352
+ def decode(self, indices: torch.Tensor):
353
+ # indices = rearrange(indices, "b (g r) l -> g b l r", g=self.residual_fsq.groups)
354
+
355
+ # print(f"indices: {indices.shape}, semantic_quantizer.codebook_size: {self.semantic_quantizer.codebook_size}, quantizer.codebook_size: {self.quantizer.codebook_size}, semantic min: {indices[:, 0].min()}, max: {indices[:, 0].max()}, quantizer min: {indices[:, 1:].min()}, max: {indices[:, 1:].max()}")
356
+
357
+ new_indices = torch.zeros_like(indices)
358
+ new_indices[:, 0] = torch.clamp(
359
+ indices[:, 0], max=self.semantic_quantizer.codebook_size - 1
360
+ )
361
+ new_indices[:, 1:] = torch.clamp(
362
+ indices[:, 1:], max=self.quantizer.codebook_size - 1
363
+ )
364
+
365
+ z_q_semantic = self.semantic_quantizer.from_codes(new_indices[:, :1])[0]
366
+ z_q_residual = self.quantizer.from_codes(new_indices[:, 1:])[0]
367
+ z_q = z_q_semantic + z_q_residual
368
+ z_q = self.post_module(z_q)
369
+ z_q = self.upsample(z_q)
370
+ return z_q
371
+
372
+ # def from_latents(self, latents: torch.Tensor):
373
+ # z_q, z_p, codes = super().from_latents(latents)
374
+ # z_q = self.upsample(z_q)
375
+ # return z_q, z_p, codes
376
+
377
+
378
+ if __name__ == "__main__":
379
+ rvq = DownsampleResidualVectorQuantize(
380
+ input_dim=512,
381
+ n_codebooks=8,
382
+ codebook_dim=8,
383
+ codebook_size=1024,
384
+ quantizer_dropout=0.5,
385
+ downsample_factor=[2, 2],
386
+ )
387
+ rvq.eval()
388
+ x = torch.randn(2, 512, 442)
389
+
390
+ result = rvq(x)
391
+ print(rvq)
392
+ print(result.latents.shape, result.codes.shape, result.z.shape)
393
+
394
+ # y = rvq.from_codes(result.codes)
395
+ # print(y[0].shape)
396
+
397
+ # y = rvq.from_latents(
398
+
399
+ result1 = rvq(x[:, :, :40])
400
+ print(result1.latents.shape, result1.codes.shape, result1.z.shape)
401
+
402
+ assert torch.allclose(result.z[:, :, :40], result1.z, atol=1e-8)
403
+ print("Success")
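One detail worth spelling out from the forward pass above: `codes` stacks the single semantic codebook in row 0 and the residual codebooks after it, which is why `decode` clamps row 0 against `semantic_quantizer.codebook_size` and the remaining rows against `quantizer.codebook_size`. A minimal decode sketch following the `__main__` smoke test above:

    indices = result.codes  # (B, 1 + n_codebooks, T_down); row 0 is the semantic codebook
    z_hat = rvq.decode(indices)  # (B, input_dim, ~T) upsampled features for the DAC decoder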
fish_speech/models/text2semantic/inference.py ADDED
@@ -0,0 +1,716 @@
1
+ import os
2
+ import queue
3
+ import threading
4
+ import time
5
+ from contextlib import nullcontext
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Literal, Optional, Tuple, Union
9
+
10
+ import click
11
+ import numpy as np
12
+ import torch
13
+ import torch._dynamo.config
14
+ import torch._inductor.config
15
+ from loguru import logger
16
+ from tqdm import tqdm
17
+ from transformers import AutoTokenizer
18
+
19
+ from fish_speech.content_sequence import (
20
+ ContentSequence,
21
+ TextPart,
22
+ VQPart,
23
+ )
24
+ from fish_speech.models.text2semantic.llama import BaseModelArgs
25
+ from fish_speech.text import clean_text, split_text
26
+ from fish_speech.tokenizer import IM_END_TOKEN, FishTokenizer
27
+
28
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
29
+ torch._inductor.config.coordinate_descent_tuning = True
30
+ torch._inductor.config.triton.unique_kernel_names = True
31
+
32
+ if hasattr(torch._inductor.config, "fx_graph_cache"):
33
+ # Experimental feature to reduce compilation times, will be on by default in future
34
+ torch._inductor.config.fx_graph_cache = True
35
+
36
+
37
+ from torch.nn.attention import SDPBackend, sdpa_kernel
38
+
39
+ from fish_speech.models.text2semantic.llama import (
40
+ BaseTransformer,
41
+ DualARTransformer,
42
+ NaiveTransformer,
43
+ )
44
+
45
+
46
+ def multinomial_sample_one_no_sync(
47
+ probs_sort,
48
+ ): # Does multinomial sampling without a cuda synchronization
49
+ q = torch.empty_like(probs_sort).exponential_(1)
50
+ return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
51
+
52
+
53
+ def logits_to_probs(
54
+ logits,
55
+ previous_tokens: Optional[torch.Tensor] = None,
56
+ temperature: torch.Tensor = 1.0,
57
+ top_p: torch.Tensor = 1.0,
58
+ repetition_penalty: torch.Tensor = 1.0,
59
+ ) -> torch.Tensor:
60
+ # Apply repetition penalty
61
+ if previous_tokens is not None:
62
+ previous_tokens = previous_tokens.long()
63
+ score = torch.gather(logits, dim=0, index=previous_tokens)
64
+ score = torch.where(
65
+ score < 0, score * repetition_penalty, score / repetition_penalty
66
+ )
67
+ logits.scatter_(dim=0, index=previous_tokens, src=score)
68
+
69
+ # Apply top-p sampling
70
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
71
+ cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
72
+ sorted_indices_to_remove = cum_probs > top_p
73
+ sorted_indices_to_remove[0] = False # keep at least one option
74
+ indices_to_remove = sorted_indices_to_remove.scatter(
75
+ dim=0, index=sorted_indices, src=sorted_indices_to_remove
76
+ )
77
+ logits = logits.masked_fill(indices_to_remove, -float("Inf"))
78
+
79
+ logits = logits / max(temperature, 1e-5)
80
+
81
+ probs = torch.nn.functional.softmax(logits, dim=-1)
82
+ return probs
83
+
84
+
85
+ def sample(
86
+ logits,
87
+ previous_tokens: Optional[torch.Tensor] = None,
88
+ **sampling_kwargs,
89
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
90
+ probs = logits_to_probs(
91
+ logits=logits[0, -1], previous_tokens=previous_tokens, **sampling_kwargs
92
+ )
93
+ idx_next = multinomial_sample_one_no_sync(probs)
94
+ return idx_next, probs
95
+
96
+
97
+ def decode_one_token_ar(
98
+ model: DualARTransformer,
99
+ x: torch.Tensor,
100
+ input_pos: torch.Tensor,
101
+ semantic_ids: list,
102
+ previous_tokens: torch.Tensor = None,
103
+ **sampling_kwargs,
104
+ ) -> torch.Tensor:
105
+ x = model.forward_generate(x, input_pos)
106
+
107
+ sampling_kwargs_main = sampling_kwargs.copy()
108
+ # sampling_kwargs_main["temperature"] = 0.1
109
+ # sampling_kwargs_main["top_p"] = 0.1
110
+ # sampling_kwargs_main["repetition_penalty"] = 1.0
111
+
112
+ codebooks = [
113
+ sample(
114
+ x.logits,
115
+ previous_tokens=(
116
+ previous_tokens[0] if previous_tokens is not None else None
117
+ ), # Disable repetition penalty for the token codebook
118
+ **sampling_kwargs_main,
119
+ )[0]
120
+ ]
121
+
122
+ hidden_states = x.hidden_states
123
+
124
+ # Cleanup the cache
125
+ for layer in model.fast_layers:
126
+ layer.attention.kv_cache.k_cache.fill_(0)
127
+ layer.attention.kv_cache.v_cache.fill_(0)
128
+
129
+ input_pos = torch.tensor([0], device=hidden_states.device, dtype=torch.long)
130
+ model.forward_generate_fast(hidden_states, input_pos)
131
+ a = codebooks[0] - model.tokenizer.semantic_begin_id
132
+ a[a < 0] = 0
133
+ hidden_states = model.fast_embeddings(a)
134
+ codebooks.append(a)
135
+
136
+ for codebook_idx in range(1, model.config.num_codebooks):
137
+ input_pos = torch.tensor(
138
+ [codebook_idx], device=hidden_states.device, dtype=torch.long
139
+ )
140
+ logits = model.forward_generate_fast(hidden_states, input_pos)
141
+ chunked_logits = logits[..., :1024]
142
+ a = sample(
143
+ chunked_logits,
144
+ previous_tokens=(
145
+ previous_tokens[codebook_idx + 1]
146
+ if previous_tokens is not None
147
+ else None
148
+ ),
149
+ **sampling_kwargs,
150
+ )[0]
151
+ hidden_states = model.fast_embeddings(a)
152
+ codebooks.append(a)
153
+
154
+ codebooks = torch.stack(codebooks, dim=0)
155
+ # semantic_ids_tensor = torch.tensor(semantic_ids, device=codebooks.device)
156
+ # codebooks[1:, :] = torch.masked_fill(
157
+ # codebooks[1:, :], ~torch.isin(codebooks[:1, :], semantic_ids_tensor), CODEBOOK_PAD_TOKEN_ID
158
+ # )
159
+
160
+ # print(codebooks)
161
+ return codebooks
162
+
163
+
164
+ def decode_n_tokens(
165
+ model: NaiveTransformer,
166
+ cur_token: torch.Tensor,
167
+ input_pos: torch.Tensor,
168
+ num_new_tokens: int,
169
+ semantic_ids: list,
170
+ decode_one_token=decode_one_token_ar,
171
+ **sampling_kwargs,
172
+ ):
173
+ previous_tokens = torch.zeros(
174
+ (model.config.num_codebooks + 1, model.config.max_seq_len),
175
+ dtype=torch.int,
176
+ device=cur_token.device,
177
+ )
178
+
179
+ for i in tqdm(range(num_new_tokens)):
180
+ # We need to get windowed repeat penalty
181
+ win_size = 16
182
+ if i < win_size:
183
+ window = previous_tokens[:, :win_size]
184
+ else:
185
+ window = previous_tokens[:, i - win_size : i]
186
+
187
+ with (
188
+ torch.backends.cuda.sdp_kernel(
189
+ enable_flash=False, enable_mem_efficient=False, enable_math=True
190
+ )
191
+ if torch.cuda.is_available()
192
+ else nullcontext()
193
+ ): # Actually better for Inductor to codegen attention here
194
+ next_token = decode_one_token(
195
+ model=model,
196
+ x=cur_token,
197
+ input_pos=input_pos,
198
+ previous_tokens=window,
199
+ semantic_ids=semantic_ids,
200
+ **sampling_kwargs,
201
+ )
202
+
203
+ input_pos += 1
204
+ cur_token = next_token.view(1, model.config.num_codebooks + 1, -1)
205
+ previous_tokens[:, i : i + 1] = next_token.view(
206
+ model.config.num_codebooks + 1, -1
207
+ )
208
+
209
+ if cur_token[0, 0, -1] == model.tokenizer.get_token_id(IM_END_TOKEN):
210
+ break
211
+
212
+ return previous_tokens[:, : i + 1]
213
+
214
+
215
+ @torch.no_grad()
216
+ @torch.inference_mode()
217
+ def generate(
218
+ *,
219
+ model: NaiveTransformer,
220
+ prompt: torch.Tensor,
221
+ max_new_tokens: int,
222
+ decode_one_token=decode_one_token_ar,
223
+ **sampling_kwargs,
224
+ ) -> torch.Tensor:
225
+ """
226
+ Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.
227
+ """
228
+
229
+ # create an empty tensor of the expected final shape and fill in the current tokens
230
+ T = prompt.size(1)
231
+ # semantic_id = model.tokenizer.convert_tokens_to_ids("<|semantic|>")
232
+ semantic_ids = [
233
+ model.tokenizer.get_token_id(f"<|semantic:{i}|>") for i in range(1024)
234
+ ]
235
+
236
+ if max_new_tokens:
237
+ if T + max_new_tokens > model.config.max_seq_len:
238
+ max_new_tokens = model.config.max_seq_len - T
239
+ logger.info(f"Truncating max_new_tokens to {max_new_tokens}")
240
+
241
+ T_new = T + max_new_tokens
242
+ else:
243
+ T_new = model.config.max_seq_len
244
+ max_new_tokens = T_new - T
245
+
246
+ device, dtype = prompt.device, prompt.dtype
247
+
248
+ codebook_dim = 1 + model.config.num_codebooks
249
+ # create an empty tensor of the expected final shape and fill in the current tokens
250
+ empty = torch.empty(
251
+ (codebook_dim, model.config.max_seq_len), dtype=dtype, device=device
252
+ )
253
+ empty[:, :T] = prompt
254
+ seq = empty
255
+ input_pos = torch.arange(0, T, device=device)
256
+
257
+ # Use non-accelerated version for now, to avoid compilation overhead
258
+ prefill_decode = decode_one_token_ar
259
+
260
+ next_token = prefill_decode(
261
+ model,
262
+ prompt.view(1, codebook_dim, -1),
263
+ input_pos,
264
+ semantic_ids=semantic_ids,
265
+ **sampling_kwargs,
266
+ )
267
+ seq[:, T : T + 1] = next_token
268
+
269
+ input_pos = torch.tensor([T], device=device, dtype=torch.int)
270
+ x = decode_n_tokens(
271
+ model,
272
+ next_token.view(1, codebook_dim, -1),
273
+ input_pos,
274
+ max_new_tokens - 1,
275
+ decode_one_token=decode_one_token,
276
+ semantic_ids=semantic_ids,
277
+ **sampling_kwargs,
278
+ )
279
+ # x = torch.cat(generated_tokens, dim=1)
280
+ seq = seq[:, : T + 1 + x.size(1)]
281
+ seq[:, T + 1 :] = x
282
+
283
+ return seq
284
+
285
+
286
+ def load_model(checkpoint_path, device, precision, compile=False, is_agent=False):
287
+ model = DualARTransformer.from_pretrained(
+ checkpoint_path, load_weights=True, is_agent=is_agent
+ )
288
+
289
+ model = model.to(device=device, dtype=precision)
290
+ logger.info(f"Restored model from checkpoint")
291
+
292
+ if isinstance(model, DualARTransformer):
293
+ decode_one_token = decode_one_token_ar
294
+ logger.info("Using DualARTransformer")
295
+ else:
296
+ raise ValueError("Model is not a DualARTransformer")
297
+
298
+ if compile:
299
+ logger.info("Compiling function...")
300
+ decode_one_token = torch.compile(
301
+ decode_one_token,
302
+ fullgraph=True,
303
+ backend="inductor" if torch.cuda.is_available() else "aot_eager",
304
+ mode="reduce-overhead" if torch.cuda.is_available() else None,
305
+ )
306
+
307
+ return model.eval(), decode_one_token
308
+
309
+
310
+ @dataclass
311
+ class GenerateResponse:
312
+ action: Literal["sample", "next"]
313
+ codes: Optional[torch.Tensor] = None
314
+ text: Optional[str] = None
315
+
316
+
317
+ def generate_long(
318
+ *,
319
+ model,
320
+ device: str | torch.device,
321
+ decode_one_token: callable,
322
+ text: str,
323
+ num_samples: int = 1,
324
+ max_new_tokens: int = 0,
325
+ top_p: int = 0.8,
326
+ repetition_penalty: float = 1.1,
327
+ temperature: float = 0.8,
328
+ compile: bool = False,
329
+ iterative_prompt: bool = True,
330
+ chunk_length: int = 150,
331
+ prompt_text: Optional[str | list[str]] = None,
332
+ prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
333
+ ):
334
+ assert 0 < top_p <= 1, "top_p must be in (0, 1]"
335
+ assert 0 < repetition_penalty < 2, "repetition_penalty must be in (0, 2)"
336
+ assert 0 < temperature < 2, "temperature must be in (0, 2)"
337
+
338
+ use_prompt = prompt_text is not None and prompt_tokens is not None
339
+ if use_prompt and isinstance(prompt_text, str):
340
+ prompt_text = [prompt_text]
341
+ prompt_tokens = [prompt_tokens]
342
+
343
+ assert use_prompt is False or len(prompt_text) == len(
344
+ prompt_tokens
345
+ ), "Prompt text and tokens must have the same length"
346
+
347
+ prompt_tokens = [i.cpu() for i in prompt_tokens] if use_prompt else []  # avoid iterating None when no reference prompt is given
348
+
349
+ model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
350
+ tokenizer = model.tokenizer
351
+ base_content_sequence = ContentSequence(modality="interleave")
352
+
353
+ texts = split_text(text, chunk_length) if iterative_prompt else [text]
354
+ max_length = model.config.max_seq_len
355
+
356
+ if use_prompt:
357
+ for t, c in zip(prompt_text, prompt_tokens):
358
+ base_content_sequence.append(
359
+ [
360
+ TextPart(text=t),
361
+ VQPart(codes=c),
362
+ ],
363
+ add_end=True,
364
+ )
365
+
366
+ encoded_prompts = base_content_sequence.encode_for_inference(
367
+ tokenizer, num_codebooks=model.config.num_codebooks
368
+ )
369
+ if encoded_prompts.size(1) > max_length - 2048:
370
+ raise ValueError(
371
+ f"Prompt is too long: {encoded_prompts.size(1)} > {max_length - 2048}"
372
+ )
373
+
374
+ encoded = []
375
+ for text in texts:
376
+ content_sequence = ContentSequence(modality=None)
377
+ content_sequence.append(TextPart(text=text))
378
+ encoded.append(
379
+ content_sequence.encode_for_inference(
380
+ tokenizer, num_codebooks=model.config.num_codebooks
381
+ )
382
+ )
383
+ logger.info(f"Encoded text: {text}")
384
+
385
+ # Move temperature, top_p, repetition_penalty to device
386
+ # This is important so that changing params doesn't trigger recompile
387
+ temperature = torch.tensor(temperature, device=device, dtype=torch.float)
388
+ top_p = torch.tensor(top_p, device=device, dtype=torch.float)
389
+ repetition_penalty = torch.tensor(
390
+ repetition_penalty, device=device, dtype=torch.float
391
+ )
392
+
393
+ for sample_idx in range(num_samples):
394
+ if torch.cuda.is_available():
395
+ torch.cuda.synchronize()
396
+
397
+ global_encoded = []
398
+ seg_idx = 0
399
+
400
+ while seg_idx < len(encoded):
401
+ logger.info(
402
+ f"Generating sentence {seg_idx + 1}/{len(encoded)} of sample {sample_idx + 1}/{num_samples}"
403
+ )
404
+
405
+ seg = encoded[seg_idx]
406
+ global_encoded.append(seg)
407
+
408
+ # Do not use previous segments to generate current segment for now
409
+ # lengths = reversed([seg.size(1) for seg in global_encoded])
410
+
411
+ # # Pick last 2000 tokens
412
+ # count = 0
413
+ # for i, length in enumerate(lengths):
414
+ # count += length
415
+ # if count + length > max_length - 2048 - encoded_prompts.size(1):
416
+ # break
417
+
418
+ # if i != 0 and i % 2 == 0:
419
+ # i -= 1
420
+
421
+ # # Rotate the list, always make sure first segment is included to avoid drift
422
+ # if i < len(global_encoded) - 2:
423
+ # partial_encoded = global_encoded[:2] + global_encoded[-i:]
424
+ # else:
425
+ # partial_encoded = global_encoded
426
+
427
+ # cat_encoded = torch.cat([encoded_prompts, *partial_encoded], dim=1)
428
+ if len(base_content_sequence.parts) <= 1 and len(global_encoded) >= 2:
429
+ cat_encoded = torch.cat(
430
+ [encoded_prompts, global_encoded[0], global_encoded[1], seg], dim=1
431
+ )
432
+ else:
433
+ cat_encoded = torch.cat([encoded_prompts, seg], dim=1)
434
+
435
+ cat_encoded = cat_encoded.to(device=device)
436
+ prompt_length = cat_encoded.size(1)
437
+
438
+ t0 = time.perf_counter()
439
+ y = generate(
440
+ model=model,
441
+ prompt=cat_encoded,
442
+ max_new_tokens=max_new_tokens,
443
+ decode_one_token=decode_one_token,
444
+ temperature=temperature,
445
+ top_p=top_p,
446
+ repetition_penalty=repetition_penalty,
447
+ )
448
+
449
+ if sample_idx == 0 and seg_idx == 0 and compile:
450
+ logger.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
451
+
452
+ if torch.cuda.is_available():
453
+ torch.cuda.synchronize()
454
+
455
+ t = time.perf_counter() - t0
456
+
457
+ tokens_generated = y.size(1) - prompt_length
458
+ tokens_sec = tokens_generated / t
459
+ logger.info(
460
+ f"Generated {tokens_generated} tokens in {t:.02f} seconds, {tokens_sec:.02f} tokens/sec"
461
+ )
462
+ logger.info(
463
+ f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s"
464
+ )
465
+
466
+ if torch.cuda.is_available():
467
+ logger.info(
468
+ f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
469
+ )
470
+
471
+ # Put the generated tokens
472
+ # since there is <im_end>, we remove last token
473
+ codes = y[1:, prompt_length:-1].clone()
474
+ assert (codes >= 0).all(), "Negative code found"
475
+
476
+ decoded = y[:, prompt_length:].clone()
477
+ # But for global encoding, we should keep the <im_end> token
478
+
479
+ global_encoded.append(decoded.cpu())
480
+ assert (codes >= 0).all(), f"Negative code found: {codes}"
481
+ yield GenerateResponse(action="sample", codes=codes, text=texts[seg_idx])
482
+ seg_idx += 1
483
+
484
+ # This indicates the end of the current sample
485
+ yield GenerateResponse(action="next")
486
+
487
+
488
+ @dataclass
489
+ class WrappedGenerateResponse:
490
+ status: Literal["success", "error"]
491
+ response: Optional[GenerateResponse | Exception] = None
492
+
493
+
494
+ @dataclass
495
+ class GenerateRequest:
496
+ request: dict
497
+ response_queue: queue.Queue
498
+
499
+
500
+ def launch_thread_safe_queue(
501
+ checkpoint_path,
502
+ device,
503
+ precision,
504
+ compile: bool = False,
505
+ ):
506
+ input_queue = queue.Queue()
507
+ init_event = threading.Event()
508
+
509
+ def worker():
510
+ model, decode_one_token = load_model(
511
+ checkpoint_path, device, precision, compile=compile
512
+ )
513
+ with torch.device(device):
514
+ model.setup_caches(
515
+ max_batch_size=1,
516
+ max_seq_len=model.config.max_seq_len,
517
+ dtype=next(model.parameters()).dtype,
518
+ )
519
+ init_event.set()
520
+
521
+ while True:
522
+ item: GenerateRequest | None = input_queue.get()
523
+ if item is None:
524
+ break
525
+
526
+ kwargs = item.request
527
+ response_queue = item.response_queue
528
+
529
+ try:
530
+ for chunk in generate_long(
531
+ model=model, decode_one_token=decode_one_token, **kwargs
532
+ ):
533
+ response_queue.put(
534
+ WrappedGenerateResponse(status="success", response=chunk)
535
+ )
536
+ except Exception as e:
537
+ response_queue.put(WrappedGenerateResponse(status="error", response=e))
538
+
539
+ threading.Thread(target=worker, daemon=True).start()
540
+ init_event.wait()
541
+
542
+ return input_queue
543
+
544
+
545
+ def launch_thread_safe_queue_agent(
546
+ checkpoint_path,
547
+ device,
548
+ precision,
549
+ compile: bool = False,
550
+ ):
551
+ input_queue = queue.Queue()
552
+ init_event = threading.Event()
553
+
554
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
555
+ config = BaseModelArgs.from_pretrained(checkpoint_path)
556
+
557
+ def worker():
558
+ model, decode_one_token = load_model(
559
+ checkpoint_path, device, precision, compile=compile, is_agent=True
560
+ )
561
+
562
+ with torch.device(device):
563
+ model.setup_caches(
564
+ max_batch_size=1,
565
+ max_seq_len=model.config.max_seq_len,
566
+ dtype=next(model.parameters()).dtype,
567
+ )
568
+ init_event.set()
569
+
570
+ while True:
571
+ item: GenerateRequest | None = input_queue.get()
572
+ if item is None:
573
+ break
574
+
575
+ kwargs = item.request
576
+ response_queue = item.response_queue
577
+
578
+ try:
579
+ for token in generate_agent(
580
+ model=model,
581
+ decode_one_token=decode_one_token,
582
+ **kwargs,
583
+ ):
584
+ response_queue.put(token)
585
+
586
+ response_queue.put("stop")
587
+ except Exception as e:
588
+ import traceback
589
+
590
+ logger.exception(f"Error in worker: {traceback.format_exc()}")
591
+ response_queue.put("error")
592
+
593
+ threading.Thread(target=worker, daemon=True).start()
594
+ init_event.wait()
595
+
596
+ return input_queue, tokenizer, config
597
+
598
+
599
+ @click.command()
600
+ @click.option(
601
+ "--text",
602
+ type=str,
603
+ default="你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.",
604
+ )
605
+ @click.option("--prompt-text", type=str, default=None, multiple=True)
606
+ @click.option(
607
+ "--prompt-tokens",
608
+ type=click.Path(path_type=Path, exists=True),
609
+ default=None,
610
+ multiple=True,
611
+ )
612
+ @click.option("--num-samples", type=int, default=1)
613
+ @click.option("--max-new-tokens", type=int, default=0)
614
+ @click.option("--top-p", type=float, default=0.8)
615
+ @click.option("--repetition-penalty", type=float, default=1.1)
616
+ @click.option("--temperature", type=float, default=0.8)
617
+ @click.option(
618
+ "--checkpoint-path",
619
+ type=click.Path(path_type=Path, exists=True),
620
+ default="checkpoints/openaudio-s1-mini",
621
+ )
622
+ @click.option("--device", type=str, default="cuda")
623
+ @click.option("--compile/--no-compile", default=False)
624
+ @click.option("--seed", type=int, default=42)
625
+ @click.option("--half/--no-half", default=False)
626
+ @click.option("--iterative-prompt/--no-iterative-prompt", default=True)
627
+ @click.option("--chunk-length", type=int, default=300)
628
+ @click.option("--output-dir", type=Path, default="temp")
629
+ def main(
630
+ text: str,
631
+ prompt_text: Optional[list[str]],
632
+ prompt_tokens: Optional[list[Path]],
633
+ num_samples: int,
634
+ max_new_tokens: int,
635
+ top_p: int,
636
+ repetition_penalty: float,
637
+ temperature: float,
638
+ checkpoint_path: Path,
639
+ device: str,
640
+ compile: bool,
641
+ seed: int,
642
+ half: bool,
643
+ iterative_prompt: bool,
644
+ chunk_length: int,
645
+ output_dir: Path,
646
+ ) -> None:
647
+ os.makedirs(output_dir, exist_ok=True)
648
+ precision = torch.half if half else torch.bfloat16
649
+
650
+ if prompt_text is not None and len(prompt_text) != len(prompt_tokens):
651
+ raise ValueError(
652
+ f"Number of prompt text ({len(prompt_text)}) and prompt tokens ({len(prompt_tokens)}) should be the same"
653
+ )
654
+
655
+ logger.info("Loading model ...")
656
+ t0 = time.time()
657
+ model, decode_one_token = load_model(
658
+ checkpoint_path, device, precision, compile=compile
659
+ )
660
+ with torch.device(device):
661
+ model.setup_caches(
662
+ max_batch_size=1,
663
+ max_seq_len=model.config.max_seq_len,
664
+ dtype=next(model.parameters()).dtype,
665
+ )
666
+ if torch.cuda.is_available():
667
+ torch.cuda.synchronize()
668
+
669
+ logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")
670
+
671
+ if prompt_tokens is not None:
672
+ prompt_tokens = [torch.from_numpy(np.load(p)) for p in prompt_tokens]
673
+
674
+ torch.manual_seed(seed)
675
+
676
+ if torch.cuda.is_available():
677
+ torch.cuda.manual_seed(seed)
678
+
679
+ generator = generate_long(
680
+ model=model,
681
+ device=device,
682
+ decode_one_token=decode_one_token,
683
+ text=text,
684
+ num_samples=num_samples,
685
+ max_new_tokens=max_new_tokens,
686
+ top_p=top_p,
687
+ repetition_penalty=repetition_penalty,
688
+ temperature=temperature,
689
+ compile=compile,
690
+ iterative_prompt=iterative_prompt,
691
+ chunk_length=chunk_length,
692
+ prompt_text=prompt_text,
693
+ prompt_tokens=prompt_tokens,
694
+ )
695
+
696
+ idx = 0
697
+ codes = []
698
+
699
+ for response in generator:
700
+ if response.action == "sample":
701
+ codes.append(response.codes)
702
+ logger.info(f"Sampled text: {response.text}")
703
+ elif response.action == "next":
704
+ if codes:
705
+ codes_npy_path = os.path.join(output_dir, f"codes_{idx}.npy")
706
+ np.save(codes_npy_path, torch.cat(codes, dim=1).cpu().numpy())
707
+ logger.info(f"Saved codes to {codes_npy_path}")
708
+ logger.info(f"Next sample")
709
+ codes = []
710
+ idx += 1
711
+ else:
712
+ logger.error(f"Error: {response}")
713
+
714
+
715
+ if __name__ == "__main__":
716
+ main()
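A rough usage sketch of the queue-based worker defined above (illustrative only, not part of the commit; the checkpoint path and the generation arguments are assumptions based on the CLI defaults in this file):

import queue
import torch
from fish_speech.models.text2semantic.inference import (
    GenerateRequest,
    launch_thread_safe_queue,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
input_queue = launch_thread_safe_queue(
    checkpoint_path="checkpoints/openaudio-s1-mini",  # assumed checkpoint layout
    device=device,
    precision=torch.bfloat16,
)

response_queue = queue.Queue()
input_queue.put(
    GenerateRequest(
        request={"device": device, "text": "Hello world.", "max_new_tokens": 0},
        response_queue=response_queue,
    )
)

while True:
    wrapped = response_queue.get()
    if wrapped.status == "error":
        raise wrapped.response
    if wrapped.response.action == "next":
        break  # end of the current sample
    print(wrapped.response.codes.shape, wrapped.response.text)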
fish_speech/models/text2semantic/lit_module.py CHANGED
@@ -1,202 +1,202 @@
1
- from typing import Any, Optional
2
-
3
- import lightning as L
4
- import torch
5
- import torch.nn.functional as F
6
- from lightning.pytorch.utilities.types import OptimizerLRScheduler
7
-
8
- import fish_speech.utils as utils
9
- from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
10
- from fish_speech.models.text2semantic.llama import NaiveTransformer
11
-
12
- log = utils.RankedLogger(__name__, rank_zero_only=True)
13
-
14
-
15
- class TextToSemantic(L.LightningModule):
16
- def __init__(
17
- self,
18
- model: NaiveTransformer,
19
- optimizer: Any,
20
- lr_scheduler: Any,
21
- ):
22
- super().__init__()
23
-
24
- self.model = model
25
- self.optimizer_builder = optimizer
26
- self.lr_scheduler_builder = lr_scheduler
27
-
28
- def forward(self, x):
29
- return self.model(x)
30
-
31
- def on_save_checkpoint(self, checkpoint):
32
- # Save only LoRA parameters
33
- state_dict = checkpoint["state_dict"]
34
- use_lora = any("lora" in name for name in state_dict.keys())
35
- if not use_lora:
36
- return
37
-
38
- for name in list(state_dict.keys()):
39
- if "lora" not in name:
40
- state_dict.pop(name)
41
-
42
- def configure_optimizers(self) -> OptimizerLRScheduler:
43
- # Get weight decay parameters
44
- weight_decay_parameters, other_parameters = [], []
45
- for name, param in self.named_parameters():
46
- if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
47
- other_parameters.append(param)
48
- else:
49
- weight_decay_parameters.append(param)
50
-
51
- optimizer = self.optimizer_builder(
52
- [
53
- {"params": weight_decay_parameters},
54
- {"params": other_parameters, "weight_decay": 0.0},
55
- ]
56
- )
57
-
58
- # Print the parameters and their weight decay
59
- for i in optimizer.param_groups:
60
- log.info(
61
- f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
62
- )
63
-
64
- lr_scheduler = self.lr_scheduler_builder(optimizer)
65
-
66
- return {
67
- "optimizer": optimizer,
68
- "lr_scheduler": {
69
- "scheduler": lr_scheduler,
70
- "interval": "step",
71
- },
72
- }
73
-
74
- # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
75
- def get_batch_logps(
76
- self,
77
- logits: torch.FloatTensor,
78
- labels: torch.LongTensor,
79
- average_log_prob: bool = False,
80
- ) -> torch.FloatTensor:
81
- """Compute the log probabilities of the given labels under the given logits.
82
-
83
- Args:
84
- logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
85
- labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
86
- average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
87
-
88
- Returns:
89
- A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
90
- """
91
- assert logits.shape[:-1] == labels.shape
92
-
93
- labels = labels.clone()
94
- loss_mask = labels != -100
95
-
96
- # dummy token; we'll ignore the losses on these tokens later
97
- labels[labels == -100] = 0
98
-
99
- per_token_logps = torch.gather(
100
- logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
101
- ).squeeze(-1)
102
-
103
- if average_log_prob:
104
- return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
105
- else:
106
- return (per_token_logps * loss_mask).sum(-1)
107
-
108
- def _step(self, batch, batch_idx, stage: str):
109
- is_train = stage == "train"
110
-
111
- if is_train:
112
- # Key part to make lora work
113
- # Otherwise the parameters are merged, which lead to incorrect gradients
114
- self.model.train()
115
-
116
- # Do positive and negative samples in the same batch to speed up training
117
- labels = batch["labels"]
118
- outputs = self.model(
119
- inp=batch["inputs"],
120
- key_padding_mask=batch["attention_masks"],
121
- )
122
- token_logits = outputs.token_logits
123
- codebook_logits = outputs.codebook_logits
124
-
125
- # Generate labels
126
- base_loss = F.cross_entropy(
127
- token_logits.view(-1, token_logits.size(-1)),
128
- labels[:, 0].reshape(-1),
129
- ignore_index=-100,
130
- )
131
-
132
- codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
133
- semantic_loss = F.cross_entropy(
134
- codebook_logits.view(-1, codebook_logits.size(-1)),
135
- codebook_labels.reshape(-1),
136
- ignore_index=-100,
137
- )
138
-
139
- loss = base_loss + semantic_loss
140
-
141
- self.log(
142
- f"{stage}/loss",
143
- loss,
144
- on_step=is_train,
145
- on_epoch=not is_train,
146
- prog_bar=True,
147
- logger=True,
148
- sync_dist=not is_train,
149
- )
150
-
151
- self.log(
152
- f"{stage}/base_loss",
153
- base_loss,
154
- on_step=is_train,
155
- on_epoch=not is_train,
156
- prog_bar=False,
157
- logger=True,
158
- sync_dist=not is_train,
159
- )
160
-
161
- self.log(
162
- f"{stage}/semantic_loss",
163
- semantic_loss,
164
- on_step=is_train,
165
- on_epoch=not is_train,
166
- prog_bar=False,
167
- logger=True,
168
- sync_dist=not is_train,
169
- )
170
-
171
- # Top-5 accuracy
172
- accuracy = self.get_accuracy(codebook_logits, codebook_labels)
173
- self.log(
174
- f"{stage}/top_5_accuracy",
175
- accuracy,
176
- on_step=is_train,
177
- on_epoch=not is_train,
178
- prog_bar=True,
179
- logger=True,
180
- sync_dist=not is_train,
181
- )
182
-
183
- return loss
184
-
185
- def get_accuracy(self, logits, labels):
186
- mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
187
- if mask.sum() == 0:
188
- return torch.tensor(0.0, device=logits.device)
189
-
190
- _, indices = logits.topk(5, dim=-1)
191
- correct = indices.eq(labels.unsqueeze(-1))
192
- correct[~mask] = 0
193
- correct = correct.sum()
194
- accuracy = correct / mask.sum()
195
-
196
- return accuracy
197
-
198
- def training_step(self, batch, batch_idx):
199
- return self._step(batch, batch_idx, "train")
200
-
201
- def validation_step(self, batch, batch_idx):
202
- return self._step(batch, batch_idx, "val")
 
1
+ from typing import Any, Optional
2
+
3
+ import lightning as L
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from lightning.pytorch.utilities.types import OptimizerLRScheduler
7
+
8
+ import fish_speech.utils as utils
9
+ from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
10
+ from fish_speech.models.text2semantic.llama import NaiveTransformer
11
+
12
+ log = utils.RankedLogger(__name__, rank_zero_only=True)
13
+
14
+
15
+ class TextToSemantic(L.LightningModule):
16
+ def __init__(
17
+ self,
18
+ model: NaiveTransformer,
19
+ optimizer: Any,
20
+ lr_scheduler: Any,
21
+ ):
22
+ super().__init__()
23
+
24
+ self.model = model
25
+ self.optimizer_builder = optimizer
26
+ self.lr_scheduler_builder = lr_scheduler
27
+
28
+ def forward(self, x):
29
+ return self.model(x)
30
+
31
+ def on_save_checkpoint(self, checkpoint):
32
+ # Save only LoRA parameters
33
+ state_dict = checkpoint["state_dict"]
34
+ use_lora = any("lora" in name for name in state_dict.keys())
35
+ if not use_lora:
36
+ return
37
+
38
+ for name in list(state_dict.keys()):
39
+ if "lora" not in name:
40
+ state_dict.pop(name)
41
+
42
+ def configure_optimizers(self) -> OptimizerLRScheduler:
43
+ # Get weight decay parameters
44
+ weight_decay_parameters, other_parameters = [], []
45
+ for name, param in self.named_parameters():
46
+ if ".bias" in name or "norm.weight" in name or ".embeddings." in name:
47
+ other_parameters.append(param)
48
+ else:
49
+ weight_decay_parameters.append(param)
50
+
51
+ optimizer = self.optimizer_builder(
52
+ [
53
+ {"params": weight_decay_parameters},
54
+ {"params": other_parameters, "weight_decay": 0.0},
55
+ ]
56
+ )
57
+
58
+ # Print the parameters and their weight decay
59
+ for i in optimizer.param_groups:
60
+ log.info(
61
+ f"Set weight decay: {i['weight_decay']} for {len(i['params'])} parameters"
62
+ )
63
+
64
+ lr_scheduler = self.lr_scheduler_builder(optimizer)
65
+
66
+ return {
67
+ "optimizer": optimizer,
68
+ "lr_scheduler": {
69
+ "scheduler": lr_scheduler,
70
+ "interval": "step",
71
+ },
72
+ }
73
+
74
+ # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90
75
+ def get_batch_logps(
76
+ self,
77
+ logits: torch.FloatTensor,
78
+ labels: torch.LongTensor,
79
+ average_log_prob: bool = False,
80
+ ) -> torch.FloatTensor:
81
+ """Compute the log probabilities of the given labels under the given logits.
82
+
83
+ Args:
84
+ logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)
85
+ labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)
86
+ average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
87
+
88
+ Returns:
89
+ A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
90
+ """
91
+ assert logits.shape[:-1] == labels.shape
92
+
93
+ labels = labels.clone()
94
+ loss_mask = labels != -100
95
+
96
+ # dummy token; we'll ignore the losses on these tokens later
97
+ labels[labels == -100] = 0
98
+
99
+ per_token_logps = torch.gather(
100
+ logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)
101
+ ).squeeze(-1)
102
+
103
+ if average_log_prob:
104
+ return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
105
+ else:
106
+ return (per_token_logps * loss_mask).sum(-1)
107
+
108
+ def _step(self, batch, batch_idx, stage: str):
109
+ is_train = stage == "train"
110
+
111
+ if is_train:
112
+ # Key part to make lora work
113
+ # Otherwise the parameters are merged, which lead to incorrect gradients
114
+ self.model.train()
115
+
116
+ # Do positive and negative samples in the same batch to speed up training
117
+ labels = batch["labels"]
118
+ outputs = self.model(
119
+ inp=batch["inputs"],
120
+ key_padding_mask=batch["attention_masks"],
121
+ )
122
+ token_logits = outputs.token_logits
123
+ codebook_logits = outputs.codebook_logits
124
+
125
+ # Generate labels
126
+ base_loss = F.cross_entropy(
127
+ token_logits.view(-1, token_logits.size(-1)),
128
+ labels[:, 0].reshape(-1),
129
+ ignore_index=-100,
130
+ )
131
+
132
+ codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks].mT
133
+ semantic_loss = F.cross_entropy(
134
+ codebook_logits.view(-1, codebook_logits.size(-1)),
135
+ codebook_labels.reshape(-1),
136
+ ignore_index=-100,
137
+ )
138
+
139
+ loss = base_loss + semantic_loss
140
+
141
+ self.log(
142
+ f"{stage}/loss",
143
+ loss,
144
+ on_step=is_train,
145
+ on_epoch=not is_train,
146
+ prog_bar=True,
147
+ logger=True,
148
+ sync_dist=not is_train,
149
+ )
150
+
151
+ self.log(
152
+ f"{stage}/base_loss",
153
+ base_loss,
154
+ on_step=is_train,
155
+ on_epoch=not is_train,
156
+ prog_bar=False,
157
+ logger=True,
158
+ sync_dist=not is_train,
159
+ )
160
+
161
+ self.log(
162
+ f"{stage}/semantic_loss",
163
+ semantic_loss,
164
+ on_step=is_train,
165
+ on_epoch=not is_train,
166
+ prog_bar=False,
167
+ logger=True,
168
+ sync_dist=not is_train,
169
+ )
170
+
171
+ # Top-5 accuracy
172
+ accuracy = self.get_accuracy(codebook_logits, codebook_labels)
173
+ self.log(
174
+ f"{stage}/top_5_accuracy",
175
+ accuracy,
176
+ on_step=is_train,
177
+ on_epoch=not is_train,
178
+ prog_bar=True,
179
+ logger=True,
180
+ sync_dist=not is_train,
181
+ )
182
+
183
+ return loss
184
+
185
+ def get_accuracy(self, logits, labels):
186
+ mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)
187
+ if mask.sum() == 0:
188
+ return torch.tensor(0.0, device=logits.device)
189
+
190
+ _, indices = logits.topk(5, dim=-1)
191
+ correct = indices.eq(labels.unsqueeze(-1))
192
+ correct[~mask] = 0
193
+ correct = correct.sum()
194
+ accuracy = correct / mask.sum()
195
+
196
+ return accuracy
197
+
198
+ def training_step(self, batch, batch_idx):
199
+ return self._step(batch, batch_idx, "train")
200
+
201
+ def validation_step(self, batch, batch_idx):
202
+ return self._step(batch, batch_idx, "val")
fish_speech/models/text2semantic/llama.py CHANGED
@@ -1,887 +1,903 @@
1
- import dataclasses
2
- import json
3
- import math
4
- from collections import OrderedDict
5
- from dataclasses import dataclass
6
- from pathlib import Path
7
- from typing import Optional
8
-
9
- import torch
10
- import torch.nn as nn
11
- from einops import rearrange
12
- from loguru import logger
13
- from torch import Tensor
14
- from torch.nn import functional as F
15
- from torch.nn.attention import SDPBackend, sdpa_kernel
16
- from torch.utils.checkpoint import checkpoint
17
- from transformers import AutoTokenizer
18
-
19
- from fish_speech.tokenizer import SEMANTIC_TOKENS, FishTokenizer
20
- from fish_speech.utils import RankedLogger
21
-
22
- from .lora import LoraConfig, setup_lora
23
-
24
- log = RankedLogger(__name__, rank_zero_only=True)
25
-
26
-
27
- def find_multiple(n: int, k: int) -> int:
28
- if n % k == 0:
29
- return n
30
- return n + k - (n % k)
31
-
32
-
33
- @dataclass
34
- class BaseModelArgs:
35
- model_type: str = "base"
36
-
37
- vocab_size: int = 32000
38
- n_layer: int = 32
39
- n_head: int = 32
40
- dim: int = 4096
41
- intermediate_size: int = None
42
- n_local_heads: int = -1
43
- head_dim: int = 64
44
- rope_base: float = 10000
45
- norm_eps: float = 1e-5
46
- max_seq_len: int = 2048
47
- dropout: float = 0.0
48
- tie_word_embeddings: bool = True
49
- attention_qkv_bias: bool = False
50
-
51
- # Codebook configs
52
- codebook_size: int = 160
53
- num_codebooks: int = 4
54
-
55
- # Gradient checkpointing
56
- use_gradient_checkpointing: bool = True
57
-
58
- # Initialize the model
59
- initializer_range: float = 0.02
60
-
61
- # Dummy vars
62
- is_reward_model: bool = False
63
- share_codebook_embeddings: bool = True
64
- scale_codebook_embeddings: bool = False
65
-
66
- def __post_init__(self):
67
- if self.n_local_heads == -1:
68
- self.n_local_heads = self.n_head
69
- if self.intermediate_size is None:
70
- hidden_dim = 4 * self.dim
71
- n_hidden = int(2 * hidden_dim / 3)
72
- self.intermediate_size = find_multiple(n_hidden, 256)
73
- self.head_dim = self.dim // self.n_head
74
-
75
- @staticmethod
76
- def from_pretrained(path: str):
77
- path = Path(path)
78
-
79
- if path.is_dir():
80
- path = path / "config.json"
81
-
82
- with open(path, "r", encoding="utf-8") as f:
83
- data = json.load(f)
84
-
85
- match data["model_type"]:
86
- case "naive":
87
- cls = NaiveModelArgs
88
- case "dual_ar":
89
- cls = DualARModelArgs
90
- case _:
91
- raise ValueError(f"Unknown model type: {data['model_type']}")
92
-
93
- return cls(**data)
94
-
95
- def save(self, path: str):
96
- with open(path, "w") as f:
97
- json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
98
-
99
-
100
- @dataclass
101
- class NaiveModelArgs(BaseModelArgs):
102
- model_type: str = "naive"
103
-
104
-
105
- @dataclass
106
- class DualARModelArgs(BaseModelArgs):
107
- model_type: str = "dual_ar"
108
- n_fast_layer: int = 4
109
- fast_dim: int | None = None
110
- fast_n_head: int | None = None
111
- fast_n_local_heads: int | None = None
112
- fast_head_dim: int | None = None
113
- fast_intermediate_size: int | None = None
114
- fast_attention_qkv_bias: bool | None = None
115
-
116
- def __post_init__(self):
117
- super().__post_init__()
118
-
119
- self.fast_dim = self.fast_dim or self.dim
120
- self.fast_n_head = self.fast_n_head or self.n_head
121
- self.fast_n_local_heads = self.fast_n_local_heads or self.n_local_heads
122
- self.fast_head_dim = self.fast_head_dim or self.head_dim
123
- self.fast_intermediate_size = (
124
- self.fast_intermediate_size or self.intermediate_size
125
- )
126
- self.fast_attention_qkv_bias = (
127
- self.fast_attention_qkv_bias
128
- if self.fast_attention_qkv_bias is not None
129
- else self.attention_qkv_bias
130
- )
131
-
132
-
133
- class KVCache(nn.Module):
134
- def __init__(
135
- self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
136
- ):
137
- super().__init__()
138
- cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
139
- self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
140
- self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
141
-
142
- def update(self, input_pos, k_val, v_val):
143
- # input_pos: [S], k_val: [B, H, S, D]
144
- assert input_pos.shape[0] == k_val.shape[2]
145
-
146
- k_out = self.k_cache
147
- v_out = self.v_cache
148
- k_out[:, :, input_pos] = k_val
149
- v_out[:, :, input_pos] = v_val
150
-
151
- return k_out, v_out
152
-
153
-
154
- @dataclass
155
- class TransformerForwardResult:
156
- token_logits: Tensor
157
- codebook_logits: Tensor
158
-
159
-
160
- @dataclass
161
- class BaseTransformerForwardResult:
162
- logits: Tensor
163
- hidden_states: Tensor
164
-
165
-
166
- class BaseTransformer(nn.Module):
167
- def __init__(
168
- self,
169
- config: BaseModelArgs,
170
- tokenizer: FishTokenizer | AutoTokenizer,
171
- init_weights: bool = True,
172
- ) -> None:
173
- super().__init__()
174
- self.config = config
175
- self.tokenizer = tokenizer
176
- self.semantic_token_ids = [
177
- tokenizer.get_token_id(SEMANTIC_TOKEN) for SEMANTIC_TOKEN in SEMANTIC_TOKENS
178
- ]
179
-
180
- # Slow transformer
181
- self.embeddings = nn.Embedding(
182
- config.vocab_size,
183
- config.dim,
184
- )
185
- self.codebook_embeddings = nn.Embedding(
186
- config.codebook_size * config.num_codebooks,
187
- config.dim,
188
- )
189
- self.layers = nn.ModuleList(
190
- TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
191
- )
192
- self.norm = RMSNorm(config.dim, eps=config.norm_eps)
193
-
194
- if self.config.tie_word_embeddings is False:
195
- self.output = nn.Linear(
196
- config.dim,
197
- config.vocab_size,
198
- bias=False,
199
- )
200
-
201
- self.register_buffer(
202
- "freqs_cis",
203
- precompute_freqs_cis(
204
- config.max_seq_len,
205
- config.dim // config.n_head,
206
- config.rope_base,
207
- ),
208
- persistent=False,
209
- )
210
- self.register_buffer(
211
- "causal_mask",
212
- torch.tril(
213
- torch.ones(
214
- config.max_seq_len,
215
- config.max_seq_len,
216
- dtype=torch.bool,
217
- )
218
- ),
219
- persistent=False,
220
- )
221
-
222
- # For kv cache
223
- self.max_batch_size = -1
224
- self.max_seq_len = -1
225
-
226
- if init_weights:
227
- self.apply(self._init_weights)
228
-
229
- def setup_caches(
230
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
231
- ):
232
- if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
233
- return
234
-
235
- head_dim = self.config.dim // self.config.n_head
236
- max_seq_len = find_multiple(max_seq_len, 8)
237
- self.max_seq_len = max_seq_len
238
- self.max_batch_size = max_batch_size
239
-
240
- for b in self.layers:
241
- b.attention.kv_cache = KVCache(
242
- max_batch_size,
243
- max_seq_len,
244
- self.config.n_local_heads,
245
- head_dim,
246
- dtype=dtype,
247
- )
248
-
249
- def embed(self, x: Tensor) -> Tensor:
250
- vocab_embeds = [self.embeddings(x[:, 0])]
251
- for i in range(self.config.num_codebooks):
252
- emb = self.codebook_embeddings(x[:, i + 1] + i * self.config.codebook_size)
253
- semantic_token_ids_tensor = torch.tensor(
254
- self.semantic_token_ids, device=x.device
255
- )
256
- emb[~torch.isin(x[:, 0], semantic_token_ids_tensor)] = 0
257
-
258
- x = torch.stack(vocab_embeds, dim=3)
259
- x = x.sum(dim=3)
260
-
261
- return x
262
-
263
- def forward(
264
- self,
265
- inp: Tensor,
266
- key_padding_mask: Optional[Tensor] = None,
267
- ) -> BaseTransformerForwardResult:
268
- seq_len = inp.size(2)
269
-
270
- # Here we want to merge the embeddings of the codebooks
271
- x = self.embed(inp)
272
-
273
- freqs_cis = self.freqs_cis[:seq_len]
274
-
275
- # Not that the causal mask here follows the definition of scaled_dot_product_attention
276
- # That is, FALSE means masked out
277
- # To maintain consistency, key_padding_mask use TRUE to mask out
278
- mask = None
279
- if key_padding_mask is not None:
280
- mask = self.causal_mask[None, None, :seq_len, :seq_len] # (B, N, Q, K)
281
- mask = mask & key_padding_mask[:, None, None, :].logical_not()
282
-
283
- for layer in self.layers:
284
- if self.config.use_gradient_checkpointing and self.training:
285
- x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
286
- else:
287
- x = layer(x, freqs_cis, mask)
288
-
289
- # We got slow_out here
290
- slow_out = self.norm(x)
291
-
292
- if self.config.tie_word_embeddings:
293
- token_logits = F.linear(slow_out, self.embeddings.weight)
294
- else:
295
- token_logits = self.output(slow_out)
296
-
297
- return BaseTransformerForwardResult(
298
- logits=token_logits,
299
- hidden_states=x,
300
- )
301
-
302
- def forward_generate(
303
- self,
304
- inp: Tensor,
305
- input_pos: Optional[Tensor] = None,
306
- vq_masks: Optional[Tensor] = None, # this is not used in fact
307
- return_all: bool = False,
308
- ) -> BaseTransformerForwardResult:
309
- # This is used for generation, optimized for torch compile
310
- # assert (
311
- # self.max_seq_len != -1 and self.max_batch_size != -1
312
- # ), "Please call setup_caches before forward_generate"
313
-
314
- embeds = []
315
- for i in range(self.config.num_codebooks):
316
- if self.config.share_codebook_embeddings:
317
- _tokens = inp[:, i + 1] + i * self.config.codebook_size
318
- else:
319
- _tokens = inp[:, i + 1]
320
-
321
- emb = self.codebook_embeddings(_tokens)
322
- embeds.append(emb)
323
-
324
- vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
325
- # if self.config.use_codebook_mlp:
326
- # vq_embeds_sum = vq_embeds_sum / self.config.num_codebooks
327
- # vq_embeds_sum = self.codebook_mlp(vq_embeds_sum)
328
-
329
- vq_masks = (inp[:, 0] >= self.tokenizer.semantic_begin_id) & (
330
- inp[:, 0] <= self.tokenizer.semantic_end_id
331
- )
332
-
333
- vq_embeds_sum[~vq_masks] = 0
334
- x = self.embeddings(inp[:, 0]) + vq_embeds_sum
335
-
336
- if input_pos is None:
337
- input_pos = torch.arange(inp.shape[-1], device=x.device)
338
- max_seq_len = inp.shape[-1]
339
- else:
340
- max_seq_len = self.max_seq_len
341
-
342
- mask = self.causal_mask[None, None, input_pos, :max_seq_len] # (B, N, Q, K)
343
- freqs_cis = self.freqs_cis[input_pos]
344
-
345
- for layer in self.layers:
346
- x = layer(x, freqs_cis, mask, input_pos=input_pos)
347
-
348
- # If prefill, we only calculate the logits of last token
349
- if x.size(1) > 1 and not return_all:
350
- x = x[:, -1:]
351
-
352
- # We got slow_out here
353
- slow_out = self.norm(x)
354
-
355
- if self.config.is_reward_model:
356
- token_logits = self.score_output(slow_out)
357
- elif self.config.tie_word_embeddings:
358
- token_logits = F.linear(slow_out, self.embeddings.weight)
359
- else:
360
- token_logits = self.output(slow_out)
361
-
362
- return BaseTransformerForwardResult(
363
- logits=token_logits,
364
- hidden_states=x,
365
- )
366
-
367
- def _init_weights(self, module):
368
- std = self.config.initializer_range
369
- if isinstance(module, nn.Linear):
370
- module.weight.data.normal_(mean=0.0, std=std)
371
- if module.bias is not None:
372
- module.bias.data.zero_()
373
- elif isinstance(module, nn.Embedding):
374
- module.weight.data.normal_(mean=0.0, std=std)
375
- if module.padding_idx is not None:
376
- module.weight.data[module.padding_idx].zero_()
377
-
378
- @staticmethod
379
- def from_pretrained(
380
- path: str,
381
- load_weights: bool = False,
382
- max_length: int | None = None,
383
- lora_config: LoraConfig | None = None,
384
- rope_base: int | None = None,
385
- is_agent: bool = False,
386
- ) -> "BaseTransformer":
387
- config = BaseModelArgs.from_pretrained(str(path))
388
- if max_length is not None:
389
- config.max_seq_len = max_length
390
- log.info(f"Override max_seq_len to {max_length}")
391
-
392
- if rope_base is not None:
393
- config.rope_base = rope_base
394
- log.info(f"Override rope_base to {rope_base}")
395
-
396
- match config.model_type:
397
- case "naive":
398
- model_cls = NaiveTransformer
399
- case "dual_ar":
400
- model_cls = DualARTransformer
401
- case _:
402
- raise ValueError(f"Unknown model type: {config.model_type}")
403
-
404
- if is_agent:
405
- tokenizer = AutoTokenizer.from_pretrained(str(path))
406
- else:
407
- tokenizer_path = str(path) + "/tokenizer.tiktoken"
408
- tokenizer = FishTokenizer(tokenizer_path)
409
-
410
- log.info(f"Loading model from {path}, config: {config}")
411
- model = model_cls(config, tokenizer=tokenizer)
412
-
413
- if lora_config is not None:
414
- setup_lora(model, lora_config)
415
- log.info(f"LoRA setup: {lora_config}")
416
-
417
- if load_weights is False:
418
- log.info("Randomly initialized model")
419
- else:
420
-
421
- if "int8" in str(Path(path)):
422
- logger.info("Using int8 weight-only quantization!")
423
- from tools.llama.quantize import WeightOnlyInt8QuantHandler
424
-
425
- simple_quantizer = WeightOnlyInt8QuantHandler(model)
426
- model = simple_quantizer.convert_for_runtime()
427
-
428
- if "int4" in str(Path(path)):
429
- logger.info("Using int4 quantization!")
430
- path_comps = path.name.split("-")
431
- assert path_comps[-2].startswith("g")
432
- groupsize = int(path_comps[-2][1:])
433
- from tools.llama.quantize import WeightOnlyInt4QuantHandler
434
-
435
- simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
436
- model = simple_quantizer.convert_for_runtime()
437
-
438
- weights = torch.load(
439
- Path(path) / "model.pth",
440
- map_location="cpu",
441
- mmap=True,
442
- weights_only=True,
443
- )
444
-
445
- if "state_dict" in weights:
446
- logger.warning(
447
- "Using a TextToSemantic LightningModule checkpoint, "
448
- "please make sure it is a full model, not a LoRA model."
449
- )
450
- weights = weights["state_dict"]
451
-
452
- if next(iter(weights.keys())).startswith("model."):
453
- logger.info(
454
- f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
455
- )
456
- new_weights = OrderedDict()
457
- for k, v in weights.items():
458
- new_weights[k.replace("model.", "")] = v
459
- weights = new_weights
460
-
461
- # Verify the name and shape of parameters since strict=False in load_state_dict.
462
- for k, v in model.named_parameters():
463
- if k not in weights:
464
- logger.warning(f"No weight for {k}")
465
- elif v.shape != weights[k].shape:
466
- logger.warning(
467
- f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
468
- )
469
-
470
- err = model.load_state_dict(weights, strict=False, assign=True)
471
- log.info(f"Loaded weights with error: {err}")
472
-
473
- return model
474
-
475
- def save_pretrained(self, path: str, drop_lora: bool = False):
476
- path = Path(path)
477
- path.mkdir(parents=True, exist_ok=True)
478
-
479
- self.config.save(path / "config.json")
480
- state_dict = self.state_dict()
481
-
482
- if drop_lora:
483
- for key in list(state_dict.keys()):
484
- if "lora" not in key:
485
- continue
486
-
487
- state_dict.pop(key)
488
- log.info(f"Drop LoRA parameter: {key}")
489
-
490
- torch.save(state_dict, path / "model.pth")
491
- self.tokenizer.save_pretrained(path)
492
-
493
-
494
- class NaiveTransformer(BaseTransformer):
495
- def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
496
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
497
-
498
- self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
499
- self.codebook_output = nn.Linear(
500
- config.dim,
501
- config.codebook_size * config.num_codebooks,
502
- bias=False,
503
- )
504
-
505
- self.apply(self._init_weights)
506
-
507
- def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
508
- token_logits = result.logits
509
- x = result.hidden_states
510
-
511
- # Codebook
512
- codebook_logits = self.codebook_output(self.codebook_norm(x))
513
- codebook_logits = rearrange(
514
- codebook_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
515
- )
516
-
517
- return TransformerForwardResult(
518
- token_logits=token_logits,
519
- codebook_logits=codebook_logits,
520
- )
521
-
522
- def forward(
523
- self,
524
- inp: Tensor,
525
- key_padding_mask: Optional[Tensor] = None,
526
- ) -> TransformerForwardResult:
527
- result = super().forward(
528
- inp=inp,
529
- key_padding_mask=key_padding_mask,
530
- )
531
- return self.decode(result)
532
-
533
- def forward_generate(
534
- self, x: Tensor, input_pos: Optional[Tensor] = None
535
- ) -> TransformerForwardResult:
536
- result = super().forward_generate(x, input_pos)
537
- return self.decode(result)
538
-
539
-
540
- class DualARTransformer(BaseTransformer):
541
- def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
542
- super().__init__(config, init_weights=False, tokenizer=tokenizer)
543
-
544
- # Project to fast dim if needed
545
- if config.fast_dim is not None and config.fast_dim != config.dim:
546
- self.fast_project_in = nn.Linear(config.dim, config.fast_dim)
547
- else:
548
- self.fast_project_in = nn.Identity()
549
-
550
- # Fast transformer
551
- self.fast_embeddings = nn.Embedding(config.codebook_size, config.fast_dim)
552
-
553
- # The equivalent bs is so large that sdpa doesn't work
554
- override_config = dataclasses.replace(
555
- config,
556
- dim=config.fast_dim,
557
- n_head=config.fast_n_head,
558
- n_local_heads=config.fast_n_local_heads,
559
- head_dim=config.fast_head_dim,
560
- intermediate_size=config.fast_intermediate_size,
561
- attention_qkv_bias=config.fast_attention_qkv_bias,
562
- )
563
-
564
- self.fast_layers = nn.ModuleList(
565
- TransformerBlock(override_config, use_sdpa=False)
566
- for _ in range(config.n_fast_layer)
567
- )
568
- self.fast_norm = RMSNorm(config.fast_dim, eps=config.norm_eps)
569
- self.fast_output = nn.Linear(
570
- config.fast_dim,
571
- config.codebook_size,
572
- bias=False,
573
- )
574
-
575
- self.register_buffer(
576
- "fast_freqs_cis",
577
- precompute_freqs_cis(
578
- config.num_codebooks,
579
- config.fast_dim // config.fast_n_head,
580
- config.rope_base,
581
- ),
582
- persistent=False,
583
- )
584
- self.apply(self._init_weights)
585
-
586
- def setup_caches(
587
- self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
588
- ):
589
- super().setup_caches(max_batch_size, max_seq_len, dtype)
590
-
591
- head_dim = self.config.fast_dim // self.config.fast_n_head
592
-
593
- # Fast transformer
594
- # The max seq len here is the number of codebooks
595
- for b in self.fast_layers:
596
- b.attention.kv_cache = KVCache(
597
- max_batch_size,
598
- self.config.num_codebooks,
599
- self.config.fast_n_local_heads,
600
- head_dim,
601
- dtype=dtype,
602
- )
603
-
604
- def forward(
605
- self,
606
- inp: Tensor,
607
- key_padding_mask: Optional[Tensor] = None,
608
- ) -> TransformerForwardResult:
609
- parent_result = super().forward(inp, key_padding_mask)
610
- token_logits = parent_result.logits
611
- x = parent_result.hidden_states
612
- x = self.fast_project_in(x)
613
-
614
- # Fast transformer
615
- fast_seq_len = self.config.num_codebooks
616
- fast_mask = self.causal_mask[
617
- None, None, :fast_seq_len, :fast_seq_len
618
- ] # (B, N, Q, K)
619
-
620
- # Drop the last token and rotate left
621
- codebooks = inp[:, 1:-1, 1:]
622
- codebooks = F.pad(codebooks, (0, 1), value=0)
623
- codebook_embeddings = self.fast_embeddings(codebooks)
624
- x = torch.cat([x[:, None], codebook_embeddings], dim=1)
625
- b, s = x.size(0), x.size(2)
626
- x = rearrange(x, "b n s d -> (b s) n d") # flatten the batch and seq_len
627
-
628
- # Remove padded part
629
- codebooks = rearrange(codebooks, "b n s -> (b s) n")
630
- codebook_mask = (codebooks == 0).all(dim=-1)
631
-
632
- if torch.all(codebook_mask):
633
- # If all codebooks are padded, we keep first 8 to make sure the model runs
634
- codebook_mask[:8] = False
635
-
636
- x_bs, x_len = x.size(0), x.size(1)
637
- x = x[~codebook_mask]
638
-
639
- for layer in self.fast_layers:
640
- if self.config.use_gradient_checkpointing and self.training:
641
- x = checkpoint(
642
- layer, x, self.fast_freqs_cis, fast_mask, use_reentrant=True
643
- )
644
- else:
645
- x = layer(x, self.fast_freqs_cis, fast_mask)
646
-
647
- # unflatten the batch and num_codebooks
648
- fast_out = self.fast_norm(x)
649
- codebook_logits = self.fast_output(fast_out)
650
-
651
- # Re-pad the codebook_logits
652
- buffer = torch.zeros(
653
- x_bs,
654
- x_len,
655
- codebook_logits.size(-1),
656
- device=codebook_logits.device,
657
- dtype=codebook_logits.dtype,
658
- )
659
- buffer[~codebook_mask] = codebook_logits
660
- codebook_logits = buffer
661
-
662
- assert codebook_logits.shape[1] == self.config.num_codebooks
663
- codebook_logits = rearrange(
664
- codebook_logits,
665
- "(b s) n d -> b s n d",
666
- b=b,
667
- s=s,
668
- n=self.config.num_codebooks,
669
- )
670
-
671
- return TransformerForwardResult(
672
- token_logits=token_logits,
673
- codebook_logits=codebook_logits,
674
- )
675
-
676
- def forward_generate_fast(
677
- self, x: Tensor, input_pos: Optional[Tensor] = None
678
- ) -> Tensor:
679
- # Fast transformer
680
- x = x.view(1, 1, -1)
681
-
682
- fast_mask = self.causal_mask[
683
- None, None, input_pos, : self.config.num_codebooks
684
- ] # (B, N, Q, K)
685
- fast_freqs_cis = self.fast_freqs_cis[input_pos]
686
-
687
- for layer in self.fast_layers:
688
- x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
689
-
690
- # unflatten the batch and num_codebooks
691
- fast_out = self.fast_norm(x) # only take the last token
692
- codebook_logits = self.fast_output(fast_out)
693
-
694
- return codebook_logits
695
-
696
- def forward_generate(
697
- self,
698
- x: Tensor,
699
- input_pos: Optional[Tensor] = None,
700
- vq_masks: Optional[Tensor] = None,
701
- ) -> TransformerForwardResult:
702
- x = super().forward_generate(x, input_pos, vq_masks)
703
- x.hidden_states = self.fast_project_in(x.hidden_states)
704
- return x
705
-
706
-
707
- class TransformerBlock(nn.Module):
708
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
709
- super().__init__()
710
- self.attention = Attention(config, use_sdpa=use_sdpa)
711
- self.feed_forward = FeedForward(config)
712
- self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
713
- self.attention_norm = RMSNorm(config.dim, config.norm_eps)
714
-
715
- def forward(
716
- self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
717
- ) -> Tensor:
718
- h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
719
- out = h + self.feed_forward(self.ffn_norm(h))
720
- return out
721
-
722
-
723
- class Attention(nn.Module):
724
- def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
725
- super().__init__()
726
- assert config.dim % config.n_head == 0
727
-
728
- total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
729
- # key, query, value projections for all heads, but in a batch
730
- self.wqkv = nn.Linear(
731
- config.dim, total_head_dim, bias=config.attention_qkv_bias
732
- )
733
- self.wo = nn.Linear(config.dim, config.dim, bias=False)
734
- self.kv_cache = None
735
-
736
- self.dropout = config.dropout
737
- self.n_head = config.n_head
738
- self.head_dim = config.head_dim
739
- self.n_local_heads = config.n_local_heads
740
- self.dim = config.dim
741
- self.use_sdpa = use_sdpa
742
- self._register_load_state_dict_pre_hook(self.load_hook)
743
-
744
- def load_hook(self, state_dict, prefix, *args):
745
- if prefix + "wq.weight" in state_dict:
746
- wq = state_dict.pop(prefix + "wq.weight")
747
- wk = state_dict.pop(prefix + "wk.weight")
748
- wv = state_dict.pop(prefix + "wv.weight")
749
- state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
750
-
751
- def forward(
752
- self,
753
- x: Tensor,
754
- freqs_cis: Tensor,
755
- mask: Tensor,
756
- input_pos: Optional[Tensor] = None,
757
- ) -> Tensor:
758
- bsz, seqlen, _ = x.shape
759
-
760
- kv_size = self.n_local_heads * self.head_dim
761
- q, k, v = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)
762
-
763
- q = q.view(bsz, seqlen, self.n_head, self.head_dim)
764
- k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
765
- v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
766
-
767
- q = apply_rotary_emb(q, freqs_cis)
768
- k = apply_rotary_emb(k, freqs_cis)
769
-
770
- q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
771
-
772
- if self.kv_cache is not None:
773
- k, v = self.kv_cache.update(input_pos, k, v)
774
-
775
- k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
776
- v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
777
-
778
- if self.use_sdpa:
779
- if mask is None:
780
- with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
781
- y = F.scaled_dot_product_attention(
782
- q,
783
- k,
784
- v,
785
- dropout_p=self.dropout if self.training else 0.0,
786
- is_causal=True,
787
- # No third party attn_mask here to use flash_attention
788
- )
789
- else:
790
- y = F.scaled_dot_product_attention(
791
- q,
792
- k,
793
- v,
794
- attn_mask=mask,
795
- dropout_p=self.dropout if self.training else 0.0,
796
- )
797
- else:
798
- y = self.eq_scaled_dot_product_attention(
799
- q,
800
- k,
801
- v,
802
- attn_mask=mask,
803
- dropout_p=self.dropout if self.training else 0.0,
804
- )
805
-
806
- y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
807
-
808
- return self.wo(y)
809
-
810
- def eq_scaled_dot_product_attention(
811
- self,
812
- query,
813
- key,
814
- value,
815
- attn_mask=None,
816
- dropout_p=0.0,
817
- ) -> torch.Tensor:
818
- # This is a standard scaled dot product attention
819
- # It's low efficient, but it doesn't raise cuda error
820
-
821
- L, S = query.size(-2), key.size(-2)
822
- scale_factor = 1 / math.sqrt(query.size(-1))
823
- attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)
824
-
825
- if attn_mask is not None:
826
- if attn_mask.dtype == torch.bool:
827
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
828
- else:
829
- attn_bias += attn_mask
830
-
831
- attn_weight = query @ key.transpose(-2, -1) * scale_factor
832
- attn_weight += attn_bias
833
- attn_weight = torch.softmax(attn_weight, dim=-1)
834
- attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
835
-
836
- return attn_weight @ value
837
-
838
-
839
- class FeedForward(nn.Module):
840
- def __init__(self, config: BaseModelArgs) -> None:
841
- super().__init__()
842
- self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
843
- self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
844
- self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
845
-
846
- def forward(self, x: Tensor) -> Tensor:
847
- return self.w2(F.silu(self.w1(x)) * self.w3(x))
848
-
849
-
850
- class RMSNorm(nn.Module):
851
- def __init__(self, dim: int, eps: float = 1e-5):
852
- super().__init__()
853
- self.eps = eps
854
- self.weight = nn.Parameter(torch.ones(dim))
855
-
856
- def _norm(self, x):
857
- return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
858
-
859
- def forward(self, x: Tensor) -> Tensor:
860
- output = self._norm(x.float()).type_as(x)
861
- return output * self.weight
862
-
863
-
864
- def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
865
- freqs = 1.0 / (
866
- base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
867
- )
868
- t = torch.arange(seq_len, device=freqs.device)
869
- freqs = torch.outer(t, freqs)
870
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
871
- cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
872
- return cache.to(dtype=torch.bfloat16)
873
-
874
-
875
- def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
876
- xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
877
- freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
878
- x_out2 = torch.stack(
879
- [
880
- xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
881
- xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
882
- ],
883
- -1,
884
- )
885
-
886
- x_out2 = x_out2.flatten(3)
887
- return x_out2.type_as(x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import json
3
+ import math
4
+ from collections import OrderedDict
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from einops import rearrange
12
+ from loguru import logger
13
+ from torch import Tensor
14
+ from torch.nn import functional as F
15
+ from torch.nn.attention import SDPBackend, sdpa_kernel
16
+ from torch.utils.checkpoint import checkpoint
17
+ from transformers import AutoTokenizer
18
+
19
+ from fish_speech.models.text2semantic.lora import LoraConfig, setup_lora
20
+ from fish_speech.tokenizer import SEMANTIC_TOKENS, FishTokenizer
21
+
22
+
23
+ def find_multiple(n: int, k: int) -> int:
24
+ if n % k == 0:
25
+ return n
26
+ return n + k - (n % k)
27
+
28
+
29
+ @dataclass
30
+ class BaseModelArgs:
31
+ model_type: str = "base"
32
+
33
+ vocab_size: int = 32000
34
+ n_layer: int = 32
35
+ n_head: int = 32
36
+ dim: int = 4096
37
+ intermediate_size: int = None
38
+ n_local_heads: int = -1
39
+ head_dim: int = 64
40
+ rope_base: float = 10000
41
+ norm_eps: float = 1e-5
42
+ max_seq_len: int = 2048
43
+ dropout: float = 0.0
44
+ tie_word_embeddings: bool = True
45
+ attention_qkv_bias: bool = False
46
+ attention_o_bias: bool = False
47
+ attention_qk_norm: bool = False
48
+
49
+ # Codebook configs
50
+ codebook_size: int = 160
51
+ num_codebooks: int = 4
52
+
53
+ # Gradient checkpointing
54
+ use_gradient_checkpointing: bool = True
55
+
56
+ # Initialize the model
57
+ initializer_range: float = 0.02
58
+
59
+ # Dummy vars
60
+ is_reward_model: bool = False
61
+ scale_codebook_embeddings: bool = False
62
+
63
+ def __post_init__(self):
64
+ if self.n_local_heads == -1:
65
+ self.n_local_heads = self.n_head
66
+ if self.intermediate_size is None:
67
+ hidden_dim = 4 * self.dim
68
+ n_hidden = int(2 * hidden_dim / 3)
69
+ self.intermediate_size = find_multiple(n_hidden, 256)
70
+ if self.head_dim is None:
71
+ self.head_dim = self.dim // self.n_head
72
+
73
+ @staticmethod
74
+ def from_pretrained(path: str):
75
+ path = Path(path)
76
+
77
+ if path.is_dir():
78
+ path = path / "config.json"
79
+
80
+ with open(path, "r", encoding="utf-8") as f:
81
+ data = json.load(f)
82
+
83
+ match data["model_type"]:
84
+ case "naive":
85
+ cls = NaiveModelArgs
86
+ case "dual_ar":
87
+ cls = DualARModelArgs
88
+ case _:
89
+ raise ValueError(f"Unknown model type: {data['model_type']}")
90
+
91
+ return cls(**data)
92
+
93
+ def save(self, path: str):
94
+ with open(path, "w") as f:
95
+ json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)
96
+
97
+
98
+ @dataclass
99
+ class NaiveModelArgs(BaseModelArgs):
100
+ model_type: str = "naive"
101
+
102
+
103
+ @dataclass
104
+ class DualARModelArgs(BaseModelArgs):
105
+ model_type: str = "dual_ar"
106
+ n_fast_layer: int = 4
107
+ fast_dim: int | None = None
108
+ fast_n_head: int | None = None
109
+ fast_n_local_heads: int | None = None
110
+ fast_head_dim: int | None = None
111
+ fast_intermediate_size: int | None = None
112
+ fast_attention_qkv_bias: bool | None = None
113
+ fast_attention_qk_norm: bool | None = None
114
+ fast_attention_o_bias: bool | None = None
115
+
116
+ def __post_init__(self):
117
+ super().__post_init__()
118
+
119
+ self.fast_dim = self.fast_dim or self.dim
120
+ self.fast_n_head = self.fast_n_head or self.n_head
121
+ self.fast_n_local_heads = self.fast_n_local_heads or self.n_local_heads
122
+ self.fast_head_dim = self.fast_head_dim or self.head_dim
123
+ self.fast_intermediate_size = (
124
+ self.fast_intermediate_size or self.intermediate_size
125
+ )
126
+ self.fast_attention_qkv_bias = (
127
+ self.fast_attention_qkv_bias
128
+ if self.fast_attention_qkv_bias is not None
129
+ else self.attention_qkv_bias
130
+ )
131
+ self.fast_attention_qk_norm = (
132
+ self.fast_attention_qk_norm
133
+ if self.fast_attention_qk_norm is not None
134
+ else self.attention_qk_norm
135
+ )
136
+ self.fast_attention_o_bias = (
137
+ self.fast_attention_o_bias
138
+ if self.fast_attention_o_bias is not None
139
+ else self.attention_o_bias
140
+ )
141
+
142
+
143
+ class KVCache(nn.Module):
144
+ def __init__(
145
+ self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16
146
+ ):
147
+ super().__init__()
148
+ cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)
149
+ self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
150
+ self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))
151
+
152
+ def update(self, input_pos, k_val, v_val):
153
+ # input_pos: [S], k_val: [B, H, S, D]
154
+ assert input_pos.shape[0] == k_val.shape[2]
155
+
156
+ k_out = self.k_cache
157
+ v_out = self.v_cache
158
+ k_out[:, :, input_pos] = k_val
159
+ v_out[:, :, input_pos] = v_val
160
+
161
+ return k_out, v_out
162
+
163
+
164
+ @dataclass
165
+ class TransformerForwardResult:
166
+ token_logits: Tensor
167
+ codebook_logits: Tensor
168
+
169
+
170
+ @dataclass
171
+ class BaseTransformerForwardResult:
172
+ logits: Tensor
173
+ hidden_states: Tensor
174
+
175
+
176
+ class BaseTransformer(nn.Module):
177
+ def __init__(
178
+ self,
179
+ config: BaseModelArgs,
180
+ tokenizer: FishTokenizer,
181
+ init_weights: bool = True,
182
+ ) -> None:
183
+ super().__init__()
184
+ self.config = config
185
+ self.tokenizer = tokenizer
186
+ self.semantic_token_ids = list(tokenizer.semantic_id_to_token_id.values())
187
+
188
+ # Slow transformer
189
+ self.embeddings = nn.Embedding(
190
+ config.vocab_size,
191
+ config.dim,
192
+ )
193
+ self.codebook_embeddings = nn.Embedding(
194
+ config.codebook_size * config.num_codebooks,
195
+ config.dim,
196
+ )
197
+ self.layers = nn.ModuleList(
198
+ TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)
199
+ )
200
+ self.norm = RMSNorm(config.dim, eps=config.norm_eps)
201
+
202
+ if self.config.tie_word_embeddings is False:
203
+ self.output = nn.Linear(
204
+ config.dim,
205
+ config.vocab_size,
206
+ bias=False,
207
+ )
208
+
209
+ self.register_buffer(
210
+ "freqs_cis",
211
+ precompute_freqs_cis(
212
+ config.max_seq_len,
213
+ config.head_dim,
214
+ config.rope_base,
215
+ ),
216
+ persistent=False,
217
+ )
218
+ self.register_buffer(
219
+ "causal_mask",
220
+ torch.tril(
221
+ torch.ones(
222
+ config.max_seq_len,
223
+ config.max_seq_len,
224
+ dtype=torch.bool,
225
+ )
226
+ ),
227
+ persistent=False,
228
+ )
229
+
230
+ # For kv cache
231
+ self.max_batch_size = -1
232
+ self.max_seq_len = -1
233
+
234
+ if init_weights:
235
+ self.apply(self._init_weights)
236
+
237
+ def setup_caches(
238
+ self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
239
+ ):
240
+ if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:
241
+ return
242
+
243
+ max_seq_len = find_multiple(max_seq_len, 8)
244
+ self.max_seq_len = max_seq_len
245
+ self.max_batch_size = max_batch_size
246
+
247
+ for b in self.layers:
248
+ b.attention.kv_cache = KVCache(
249
+ max_batch_size,
250
+ max_seq_len,
251
+ self.config.n_local_heads,
252
+ self.config.head_dim,
253
+ dtype=dtype,
254
+ )
255
+
256
+ def embed(self, inp: Tensor) -> Tensor:
257
+ embeds = []
258
+ semantic_token_ids_tensor = torch.tensor(
259
+ self.semantic_token_ids, device=inp.device, dtype=inp.dtype
260
+ )
261
+
262
+ for i in range(self.config.num_codebooks):
263
+ emb = self.codebook_embeddings(
264
+ inp[:, i + 1] + i * self.config.codebook_size
265
+ )
266
+ embeds.append(emb)
267
+
268
+ vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
269
+ vq_embeds_sum[~torch.isin(inp[:, 0], semantic_token_ids_tensor)] = 0
270
+ x = self.embeddings(inp[:, 0]) + vq_embeds_sum
271
+
272
+ return x
273
+
274
+ def forward(
275
+ self,
276
+ inp: Tensor,
277
+ key_padding_mask: Optional[Tensor] = None,
278
+ ) -> BaseTransformerForwardResult:
279
+ seq_len = inp.size(2)
280
+
281
+ # Here we want to merge the embeddings of the codebooks
282
+ x = self.embed(inp)
283
+
284
+ freqs_cis = self.freqs_cis[:seq_len]
285
+
286
+ # Note that the causal mask here follows the definition of scaled_dot_product_attention
287
+ # That is, FALSE means masked out
288
+ # To maintain consistency, key_padding_mask uses TRUE to mask out
289
+ mask = None
290
+ if key_padding_mask is not None:
291
+ causal = self.causal_mask[:seq_len, :seq_len]
292
+ causal = rearrange(causal, "q k -> 1 1 q k")
293
+
294
+ atten_mask = rearrange(key_padding_mask, "b s -> b 1 1 s")
295
+ atten_mask = atten_mask.logical_not()
296
+ mask = causal & atten_mask
297
+
298
+ # return freqs_cis, mask
299
+
300
+ for layer in self.layers:
301
+ if self.config.use_gradient_checkpointing and self.training:
302
+ x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)
303
+ else:
304
+ x = layer(x, freqs_cis, mask)
305
+
306
+ # We got slow_out here
307
+ slow_out = self.norm(x)
308
+
309
+ if self.config.tie_word_embeddings:
310
+ token_logits = F.linear(slow_out, self.embeddings.weight)
311
+ else:
312
+ token_logits = self.output(slow_out)
313
+
314
+ return BaseTransformerForwardResult(
315
+ logits=token_logits,
316
+ hidden_states=x,
317
+ )
318
+
319
+ def forward_generate(
320
+ self,
321
+ inp: Tensor,
322
+ input_pos: Optional[Tensor] = None,
323
+ return_all: bool = False,
324
+ ) -> BaseTransformerForwardResult:
325
+ x = self.embed(inp)
326
+
327
+ if input_pos is None:
328
+ input_pos = torch.arange(inp.shape[-1], device=x.device)
329
+ max_seq_len = inp.shape[-1]
330
+ else:
331
+ max_seq_len = self.max_seq_len
332
+
333
+ mask = self.causal_mask[None, None, input_pos, :max_seq_len] # (B, N, Q, K)
334
+ freqs_cis = self.freqs_cis[input_pos]
335
+
336
+ for layer in self.layers:
337
+ x = layer(x, freqs_cis, mask, input_pos=input_pos)
338
+
339
+ # If prefill, we only calculate the logits of last token
340
+ if x.size(1) > 1 and not return_all:
341
+ x = x[:, -1:]
342
+
343
+ # We got slow_out here
344
+ slow_out = self.norm(x)
345
+
346
+ if self.config.is_reward_model:
347
+ token_logits = self.score_output(slow_out)
348
+ elif self.config.tie_word_embeddings:
349
+ token_logits = F.linear(slow_out, self.embeddings.weight)
350
+ else:
351
+ token_logits = self.output(slow_out)
352
+
353
+ return BaseTransformerForwardResult(
354
+ logits=token_logits,
355
+ hidden_states=x,
356
+ )
357
+
358
+ def _init_weights(self, module):
359
+ std = self.config.initializer_range
360
+ if isinstance(module, nn.Linear):
361
+ module.weight.data.normal_(mean=0.0, std=std)
362
+ if module.bias is not None:
363
+ module.bias.data.zero_()
364
+ elif isinstance(module, nn.Embedding):
365
+ module.weight.data.normal_(mean=0.0, std=std)
366
+ if module.padding_idx is not None:
367
+ module.weight.data[module.padding_idx].zero_()
368
+
369
+ @staticmethod
370
+ def from_pretrained(
371
+ path: str,
372
+ load_weights: bool = False,
373
+ max_length: int | None = None,
374
+ lora_config: LoraConfig | None = None,
375
+ rope_base: int | None = None,
376
+ ) -> "BaseTransformer":
377
+ config = BaseModelArgs.from_pretrained(str(path))
378
+ if max_length is not None:
379
+ config.max_seq_len = max_length
380
+ logger.info(f"Override max_seq_len to {max_length}")
381
+
382
+ if rope_base is not None:
383
+ config.rope_base = rope_base
384
+ logger.info(f"Override rope_base to {rope_base}")
385
+
386
+ match config.model_type:
387
+ case "naive":
388
+ model_cls = NaiveTransformer
389
+ case "dual_ar":
390
+ model_cls = DualARTransformer
391
+ case _:
392
+ raise ValueError(f"Unknown model type: {config.model_type}")
393
+
394
+ tokenizer = FishTokenizer.from_pretrained(path)
395
+
396
+ logger.info(f"Loading model from {path}, config: {config}")
397
+ model = model_cls(config, tokenizer=tokenizer)
398
+
399
+ if lora_config is not None:
400
+ setup_lora(model, lora_config)
401
+ logger.info(f"LoRA setup: {lora_config}")
402
+
403
+ if load_weights is False:
404
+ logger.info("Randomly initialized model")
405
+ else:
406
+
407
+ if "int8" in str(Path(path)):
408
+ logger.info("Using int8 weight-only quantization!")
409
+ from tools.llama.quantize import WeightOnlyInt8QuantHandler
410
+
411
+ simple_quantizer = WeightOnlyInt8QuantHandler(model)
412
+ model = simple_quantizer.convert_for_runtime()
413
+
414
+ if "int4" in str(Path(path)):
415
+ logger.info("Using int4 quantization!")
416
+ path_comps = path.name.split("-")
417
+ assert path_comps[-2].startswith("g")
418
+ groupsize = int(path_comps[-2][1:])
419
+ from tools.llama.quantize import WeightOnlyInt4QuantHandler
420
+
421
+ simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)
422
+ model = simple_quantizer.convert_for_runtime()
423
+
424
+ weights = torch.load(
425
+ Path(path) / "model.pth",
426
+ map_location="cpu",
427
+ mmap=True,
428
+ weights_only=True,
429
+ )
430
+
431
+ if "state_dict" in weights:
432
+ logger.warning(
433
+ "Using a TextToSemantic LightningModule checkpoint, "
434
+ "please make sure it is a full model, not a LoRA model."
435
+ )
436
+ weights = weights["state_dict"]
437
+
438
+ if next(iter(weights.keys())).startswith("model."):
439
+ logger.info(
440
+ f"Remove prefix 'model.' created by TextToSemantic LightningModule from keys"
441
+ )
442
+ new_weights = OrderedDict()
443
+ for k, v in weights.items():
444
+ new_weights[k.replace("model.", "")] = v
445
+ weights = new_weights
446
+
447
+ # Remove audio related weights
448
+ for k in list(weights.keys()):
449
+ if "audio_" in k:
450
+ weights.pop(k)
451
+
452
+ # Verify the name and shape of parameters since strict=False in load_state_dict.
453
+ for k, v in model.named_parameters():
454
+ if k not in weights:
455
+ logger.warning(f"No weight for {k}")
456
+ elif v.shape != weights[k].shape:
457
+ logger.warning(
458
+ f"Shape mismatch for {k}: {v.shape} vs {weights[k].shape}"
459
+ )
460
+
461
+ err = model.load_state_dict(weights, strict=False, assign=True)
462
+ logger.info(f"Loaded weights with error: {err}")
463
+
464
+ return model
465
+
466
+ def save_pretrained(self, path: str, drop_lora: bool = False):
467
+ path = Path(path)
468
+ path.mkdir(parents=True, exist_ok=True)
469
+
470
+ self.config.save(path / "config.json")
471
+ state_dict = self.state_dict()
472
+
473
+ if drop_lora:
474
+ for key in list(state_dict.keys()):
475
+ if "lora" not in key:
476
+ continue
477
+
478
+ state_dict.pop(key)
479
+ logger.info(f"Drop LoRA parameter: {key}")
480
+
481
+ torch.save(state_dict, path / "model.pth")
482
+ self.tokenizer.save_pretrained(path)
483
+
484
+
485
+ class NaiveTransformer(BaseTransformer):
486
+ def __init__(self, config: NaiveModelArgs, tokenizer: FishTokenizer) -> None:
487
+ super().__init__(config, init_weights=False, tokenizer=tokenizer)
488
+
489
+ self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)
490
+ self.codebook_output = nn.Linear(
491
+ config.dim,
492
+ config.codebook_size * config.num_codebooks,
493
+ bias=False,
494
+ )
495
+
496
+ self.apply(self._init_weights)
497
+
498
+ def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:
499
+ token_logits = result.logits
500
+ x = result.hidden_states
501
+
502
+ # Codebook
503
+ codebook_logits = self.codebook_output(self.codebook_norm(x))
504
+ codebook_logits = rearrange(
505
+ codebook_logits, "b n (c d) -> b n c d", c=self.config.num_codebooks
506
+ )
507
+
508
+ return TransformerForwardResult(
509
+ token_logits=token_logits,
510
+ codebook_logits=codebook_logits,
511
+ )
512
+
513
+ def forward(
514
+ self,
515
+ inp: Tensor,
516
+ key_padding_mask: Optional[Tensor] = None,
517
+ ) -> TransformerForwardResult:
518
+ result = super().forward(
519
+ inp=inp,
520
+ key_padding_mask=key_padding_mask,
521
+ )
522
+ return self.decode(result)
523
+
524
+ def forward_generate(
525
+ self, x: Tensor, input_pos: Optional[Tensor] = None
526
+ ) -> TransformerForwardResult:
527
+ result = super().forward_generate(x, input_pos)
528
+ return self.decode(result)
529
+
530
+
531
+ class DualARTransformer(BaseTransformer):
532
+ def __init__(self, config: DualARModelArgs, tokenizer: FishTokenizer) -> None:
533
+ super().__init__(config, init_weights=False, tokenizer=tokenizer)
534
+
535
+ # Project to fast dim if needed
536
+ if config.fast_dim is not None and config.fast_dim != config.dim:
537
+ self.fast_project_in = nn.Linear(config.dim, config.fast_dim)
538
+ else:
539
+ self.fast_project_in = nn.Identity()
540
+
541
+ # Fast transformer
542
+ self.fast_embeddings = nn.Embedding(config.codebook_size, config.fast_dim)
543
+
544
+ # The equivalent batch size (b * s) is so large that sdpa doesn't work here
545
+ override_config = dataclasses.replace(
546
+ config,
547
+ dim=config.fast_dim,
548
+ n_head=config.fast_n_head,
549
+ n_local_heads=config.fast_n_local_heads,
550
+ head_dim=config.fast_head_dim,
551
+ intermediate_size=config.fast_intermediate_size,
552
+ attention_qkv_bias=config.fast_attention_qkv_bias,
553
+ attention_qk_norm=config.fast_attention_qk_norm,
554
+ attention_o_bias=config.fast_attention_o_bias,
555
+ )
556
+
557
+ self.fast_layers = nn.ModuleList(
558
+ TransformerBlock(override_config, use_sdpa=False)
559
+ for _ in range(config.n_fast_layer)
560
+ )
561
+ self.fast_norm = RMSNorm(config.fast_dim, eps=config.norm_eps)
562
+ self.fast_output = nn.Linear(
563
+ config.fast_dim,
564
+ config.codebook_size,
565
+ bias=False,
566
+ )
567
+
568
+ self.register_buffer(
569
+ "fast_freqs_cis",
570
+ precompute_freqs_cis(
571
+ config.num_codebooks,
572
+ config.fast_head_dim,
573
+ config.rope_base,
574
+ ),
575
+ persistent=False,
576
+ )
577
+ self.apply(self._init_weights)
578
+
579
+ def setup_caches(
580
+ self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16
581
+ ):
582
+ super().setup_caches(max_batch_size, max_seq_len, dtype)
583
+
584
+ # Fast transformer
585
+ # The max seq len here is the number of codebooks
586
+ for b in self.fast_layers:
587
+ b.attention.kv_cache = KVCache(
588
+ max_batch_size,
589
+ self.config.num_codebooks,
590
+ self.config.fast_n_local_heads,
591
+ self.config.fast_head_dim,
592
+ dtype=dtype,
593
+ )
594
+
595
+ def forward(
596
+ self,
597
+ inp: Tensor,
598
+ key_padding_mask: Optional[Tensor] = None,
599
+ ) -> TransformerForwardResult:
600
+ parent_result = super().forward(inp, key_padding_mask)
601
+ token_logits = parent_result.logits
602
+ x = parent_result.hidden_states
603
+ x = self.fast_project_in(x)
604
+
605
+ # Fast transformer
606
+ fast_seq_len = self.config.num_codebooks
607
+ fast_mask = self.causal_mask[
608
+ None, None, :fast_seq_len, :fast_seq_len
609
+ ] # (B, N, Q, K)
610
+
611
+ # Drop the last token and rotate left
612
+ codebooks = inp[:, 1:-1, 1:]
613
+ codebooks = F.pad(codebooks, (0, 1), value=0)
614
+ codebook_embeddings = self.fast_embeddings(codebooks)
615
+ x = torch.cat([x[:, None], codebook_embeddings], dim=1)
616
+ b, s = x.size(0), x.size(2)
617
+ x = rearrange(x, "b n s d -> (b s) n d") # flatten the batch and seq_len
618
+
619
+ # Remove padded part
620
+ codebooks = rearrange(codebooks, "b n s -> (b s) n")
621
+ codebook_mask = (codebooks == 0).all(dim=-1)
622
+
623
+ if torch.all(codebook_mask):
624
+ # If all codebooks are padded, we keep the first 8 to make sure the model runs
625
+ codebook_mask[:8] = False
626
+
627
+ x_bs, x_len = x.size(0), x.size(1)
628
+ x = x[~codebook_mask]
629
+
630
+ for layer in self.fast_layers:
631
+ if self.config.use_gradient_checkpointing and self.training:
632
+ x = checkpoint(
633
+ layer, x, self.fast_freqs_cis, fast_mask, use_reentrant=True
634
+ )
635
+ else:
636
+ x = layer(x, self.fast_freqs_cis, fast_mask)
637
+
638
+ # unflatten the batch and num_codebooks
639
+ fast_out = self.fast_norm(x)
640
+ codebook_logits = self.fast_output(fast_out)
641
+
642
+ # Re-pad the codebook_logits
643
+ buffer = torch.zeros(
644
+ x_bs,
645
+ x_len,
646
+ codebook_logits.size(-1),
647
+ device=codebook_logits.device,
648
+ dtype=codebook_logits.dtype,
649
+ )
650
+ buffer[~codebook_mask] = codebook_logits
651
+ codebook_logits = buffer
652
+
653
+ assert codebook_logits.shape[1] == self.config.num_codebooks
654
+ codebook_logits = rearrange(
655
+ codebook_logits,
656
+ "(b s) n d -> b s n d",
657
+ b=b,
658
+ s=s,
659
+ n=self.config.num_codebooks,
660
+ )
661
+
662
+ return TransformerForwardResult(
663
+ token_logits=token_logits,
664
+ codebook_logits=codebook_logits,
665
+ )
666
+
667
+ def forward_generate_fast(
668
+ self, x: Tensor, input_pos: Optional[Tensor] = None
669
+ ) -> Tensor:
670
+ # Fast transformer
671
+ x = x.view(1, 1, -1)
672
+
673
+ fast_mask = self.causal_mask[
674
+ None, None, input_pos, : self.config.num_codebooks
675
+ ] # (B, N, Q, K)
676
+ fast_freqs_cis = self.fast_freqs_cis[input_pos]
677
+
678
+ for layer in self.fast_layers:
679
+ x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
680
+
681
+ # unflatten the batch and num_codebooks
682
+ fast_out = self.fast_norm(x) # only take the last token
683
+ codebook_logits = self.fast_output(fast_out)
684
+
685
+ return codebook_logits
686
+
687
+ def forward_generate(
688
+ self,
689
+ x: Tensor,
690
+ input_pos: Optional[Tensor] = None,
691
+ vq_masks: Optional[Tensor] = None,
692
+ ) -> TransformerForwardResult:
693
+ x = super().forward_generate(x, input_pos, vq_masks)
694
+ x.hidden_states = self.fast_project_in(x.hidden_states)
695
+ return x
696
+
697
+
698
+ class TransformerBlock(nn.Module):
699
+ def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
700
+ super().__init__()
701
+ self.attention = Attention(config, use_sdpa=use_sdpa)
702
+ self.feed_forward = FeedForward(config)
703
+ self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
704
+ self.attention_norm = RMSNorm(config.dim, config.norm_eps)
705
+
706
+ def forward(
707
+ self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None
708
+ ) -> Tensor:
709
+ h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
710
+ out = h + self.feed_forward(self.ffn_norm(h))
711
+ return out
712
+
713
+
714
+ class Attention(nn.Module):
715
+ def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):
716
+ super().__init__()
717
+ assert config.dim % config.n_head == 0
718
+
719
+ total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
720
+ # key, query, value projections for all heads, but in a batch
721
+ self.wqkv = nn.Linear(
722
+ config.dim, total_head_dim, bias=config.attention_qkv_bias
723
+ )
724
+ self.wo = nn.Linear(
725
+ config.n_head * config.head_dim, config.dim, bias=config.attention_o_bias
726
+ )
727
+ self.kv_cache = None
728
+
729
+ if config.attention_qk_norm:
730
+ self.q_norm = nn.RMSNorm(config.head_dim, config.norm_eps)
731
+ self.k_norm = nn.RMSNorm(config.head_dim, config.norm_eps)
732
+
733
+ self.dropout = config.dropout
734
+ self.n_head = config.n_head
735
+ self.head_dim = config.head_dim
736
+ self.n_local_heads = config.n_local_heads
737
+ self.dim = config.dim
738
+ self.use_sdpa = use_sdpa
739
+ self.attention_qk_norm = config.attention_qk_norm
740
+ self.config = config
741
+
742
+ self._register_load_state_dict_pre_hook(self.load_hook)
743
+
744
+ def load_hook(self, state_dict, prefix, *args):
745
+ if prefix + "wq.weight" in state_dict:
746
+ wq = state_dict.pop(prefix + "wq.weight")
747
+ wk = state_dict.pop(prefix + "wk.weight")
748
+ wv = state_dict.pop(prefix + "wv.weight")
749
+ state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv])
750
+
751
+ def forward(
752
+ self,
753
+ x: Tensor,
754
+ freqs_cis: Tensor,
755
+ mask: Tensor,
756
+ input_pos: Optional[Tensor] = None,
757
+ ) -> Tensor:
758
+ bsz, seqlen, _ = x.shape
759
+
760
+ q_size = self.n_head * self.head_dim
761
+ kv_size = self.n_local_heads * self.head_dim
762
+ q, k, v = self.wqkv(x).split([q_size, kv_size, kv_size], dim=-1)
763
+
764
+ q = q.view(bsz, seqlen, self.n_head, self.head_dim)
765
+ k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)
766
+ v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)
767
+
768
+ if self.attention_qk_norm:
769
+ q = self.q_norm(q)
770
+ k = self.k_norm(k)
771
+
772
+ q = apply_rotary_emb(q, freqs_cis)
773
+ k = apply_rotary_emb(k, freqs_cis)
774
+
775
+ q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
776
+
777
+ if self.kv_cache is not None:
778
+ k, v = self.kv_cache.update(input_pos, k, v)
779
+
780
+ k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
781
+ v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
782
+
783
+ if self.use_sdpa:
784
+ if mask is None:
785
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
786
+ y = F.scaled_dot_product_attention(
787
+ q,
788
+ k,
789
+ v,
790
+ dropout_p=self.dropout if self.training else 0.0,
791
+ is_causal=True,
792
+ # No attn_mask is passed here so that the flash attention kernel can be used
793
+ )
794
+ else:
795
+ y = F.scaled_dot_product_attention(
796
+ q,
797
+ k,
798
+ v,
799
+ attn_mask=mask,
800
+ dropout_p=self.dropout if self.training else 0.0,
801
+ )
802
+ else:
803
+ y = self.eq_scaled_dot_product_attention(
804
+ q,
805
+ k,
806
+ v,
807
+ attn_mask=mask,
808
+ dropout_p=self.dropout if self.training else 0.0,
809
+ )
810
+
811
+ y = y.transpose(1, 2).contiguous().view(bsz, seqlen, q_size)
812
+
813
+ return self.wo(y)
814
+
815
+ def eq_scaled_dot_product_attention(
816
+ self,
817
+ query,
818
+ key,
819
+ value,
820
+ attn_mask=None,
821
+ dropout_p=0.0,
822
+ ) -> torch.Tensor:
823
+ # This is a standard scaled dot product attention
824
+ # It's less efficient, but it doesn't raise CUDA errors
825
+
826
+ L, S = query.size(-2), key.size(-2)
827
+ scale_factor = 1 / math.sqrt(query.size(-1))
828
+ attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)
829
+
830
+ if attn_mask is not None:
831
+ if attn_mask.dtype == torch.bool:
832
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
833
+ else:
834
+ attn_bias += attn_mask
835
+
836
+ attn_weight = query @ key.transpose(-2, -1) * scale_factor
837
+ attn_weight += attn_bias
838
+ attn_weight = torch.softmax(attn_weight, dim=-1)
839
+ attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
840
+
841
+ return attn_weight @ value
842
+
843
+
844
+ class FeedForward(nn.Module):
845
+ def __init__(self, config: BaseModelArgs) -> None:
846
+ super().__init__()
847
+ self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
848
+ self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
849
+ self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
850
+
851
+ def forward(self, x: Tensor) -> Tensor:
852
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
853
+
854
+
855
+ class RMSNorm(nn.Module):
856
+ def __init__(self, dim: int, eps: float = 1e-5):
857
+ super().__init__()
858
+ self.eps = eps
859
+ self.weight = nn.Parameter(torch.ones(dim))
860
+
861
+ def _norm(self, x):
862
+ return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
863
+
864
+ def forward(self, x: Tensor) -> Tensor:
865
+ output = self._norm(x.float()).type_as(x)
866
+ return output * self.weight
867
+
868
+
869
+ def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:
870
+ """
871
+ Precomputes frequency tensors for complex exponentials (cis)
872
+
873
+ Args:
874
+ seq_len: Length of the sequence for which positional embeddings are needed.
875
+ n_elem: Number of elements in the frequency tensor.
876
+ base: Base value for the frequency scaling (default: 10000).
877
+
878
+ Returns:
879
+ A tensor containing the precomputed frequencies in real and imaginary parts (bfloat16).
880
+ """
881
+ freqs = 1.0 / (
882
+ base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
883
+ )
884
+ t = torch.arange(seq_len, device=freqs.device)
885
+ freqs = torch.outer(t, freqs)
886
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
887
+ cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
888
+ return cache.to(dtype=torch.bfloat16)
889
+
890
+
891
+ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
892
+ xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
893
+ freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
894
+ x_out2 = torch.stack(
895
+ [
896
+ xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
897
+ xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
898
+ ],
899
+ -1,
900
+ )
901
+
902
+ x_out2 = x_out2.flatten(3)
903
+ return x_out2.type_as(x)
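
As a quick sanity check on the rotary-embedding helpers added above, the minimal sketch below precomputes the cache and applies it to a query tensor with the shapes used by Attention.forward. It is illustrative only and assumes the new module is importable as fish_speech.models.text2semantic.llama; the tensor sizes are arbitrary.

import torch

from fish_speech.models.text2semantic.llama import apply_rotary_emb, precompute_freqs_cis

seq_len, n_head, head_dim = 16, 2, 64
# Cache shape is (seq_len, head_dim // 2, 2): real and imaginary parts stacked last.
freqs_cis = precompute_freqs_cis(seq_len, head_dim)
# Queries as produced in Attention.forward before the transpose: (bsz, seqlen, n_head, head_dim).
q = torch.randn(1, seq_len, n_head, head_dim, dtype=torch.bfloat16)
q_rot = apply_rotary_emb(q, freqs_cis[:seq_len])
assert q_rot.shape == q.shape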
fish_speech/models/text2semantic/lora.py CHANGED
@@ -1,92 +1,92 @@
1
- from dataclasses import dataclass
2
-
3
- import loralib as lora
4
-
5
-
6
- @dataclass
7
- class LoraConfig:
8
- r: int
9
- lora_alpha: float
10
- lora_dropout: float = 0.0
11
-
12
-
13
- def setup_lora(model, lora_config):
14
- # Replace the embedding layer with a LoRA layer
15
- model.embeddings = lora.Embedding(
16
- num_embeddings=model.embeddings.num_embeddings,
17
- embedding_dim=model.embeddings.embedding_dim,
18
- padding_idx=model.embeddings.padding_idx,
19
- r=lora_config.r,
20
- lora_alpha=lora_config.lora_alpha,
21
- )
22
-
23
- model.codebook_embeddings = lora.Embedding(
24
- num_embeddings=model.codebook_embeddings.num_embeddings,
25
- embedding_dim=model.codebook_embeddings.embedding_dim,
26
- padding_idx=model.codebook_embeddings.padding_idx,
27
- r=lora_config.r,
28
- lora_alpha=lora_config.lora_alpha,
29
- )
30
-
31
- # Replace output layer with a LoRA layer
32
- linears = [(model, "output")]
33
-
34
- # Replace all linear layers with LoRA layers
35
- for layer in model.layers:
36
- linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
37
- linears.extend(
38
- [
39
- (layer.feed_forward, "w1"),
40
- (layer.feed_forward, "w2"),
41
- (layer.feed_forward, "w3"),
42
- ]
43
- )
44
-
45
- if hasattr(model, "fast_layers"):
46
- model.fast_embeddings = lora.Embedding(
47
- num_embeddings=model.fast_embeddings.num_embeddings,
48
- embedding_dim=model.fast_embeddings.embedding_dim,
49
- padding_idx=model.fast_embeddings.padding_idx,
50
- r=lora_config.r,
51
- lora_alpha=lora_config.lora_alpha,
52
- )
53
-
54
- # Dual-AR model
55
- linears.append((model, "fast_output"))
56
-
57
- for layer in model.fast_layers:
58
- linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
59
- linears.extend(
60
- [
61
- (layer.feed_forward, "w1"),
62
- (layer.feed_forward, "w2"),
63
- (layer.feed_forward, "w3"),
64
- ]
65
- )
66
-
67
- for module, layer in linears:
68
- updated_linear = lora.Linear(
69
- in_features=getattr(module, layer).in_features,
70
- out_features=getattr(module, layer).out_features,
71
- bias=getattr(module, layer).bias,
72
- r=lora_config.r,
73
- lora_alpha=lora_config.lora_alpha,
74
- lora_dropout=lora_config.lora_dropout,
75
- )
76
- setattr(module, layer, updated_linear)
77
-
78
- # Mark only the LoRA layers as trainable
79
- lora.mark_only_lora_as_trainable(model, bias="none")
80
-
81
-
82
- def get_merged_state_dict(model):
83
- # This line will merge the state dict of the model and the LoRA parameters
84
- model.eval()
85
-
86
- # Then we need to remove the LoRA parameters from the state dict
87
- state_dict = model.state_dict()
88
- for name in list(state_dict.keys()):
89
- if "lora" in name:
90
- state_dict.pop(name)
91
-
92
- return state_dict
 
1
+ from dataclasses import dataclass
2
+
3
+ import loralib as lora
4
+
5
+
6
+ @dataclass
7
+ class LoraConfig:
8
+ r: int
9
+ lora_alpha: float
10
+ lora_dropout: float = 0.0
11
+
12
+
13
+ def setup_lora(model, lora_config):
14
+ # Replace the embedding layer with a LoRA layer
15
+ model.embeddings = lora.Embedding(
16
+ num_embeddings=model.embeddings.num_embeddings,
17
+ embedding_dim=model.embeddings.embedding_dim,
18
+ padding_idx=model.embeddings.padding_idx,
19
+ r=lora_config.r,
20
+ lora_alpha=lora_config.lora_alpha,
21
+ )
22
+
23
+ model.codebook_embeddings = lora.Embedding(
24
+ num_embeddings=model.codebook_embeddings.num_embeddings,
25
+ embedding_dim=model.codebook_embeddings.embedding_dim,
26
+ padding_idx=model.codebook_embeddings.padding_idx,
27
+ r=lora_config.r,
28
+ lora_alpha=lora_config.lora_alpha,
29
+ )
30
+
31
+ # Replace output layer with a LoRA layer
32
+ linears = [(model, "output")]
33
+
34
+ # Replace all linear layers with LoRA layers
35
+ for layer in model.layers:
36
+ linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
37
+ linears.extend(
38
+ [
39
+ (layer.feed_forward, "w1"),
40
+ (layer.feed_forward, "w2"),
41
+ (layer.feed_forward, "w3"),
42
+ ]
43
+ )
44
+
45
+ if hasattr(model, "fast_layers"):
46
+ model.fast_embeddings = lora.Embedding(
47
+ num_embeddings=model.fast_embeddings.num_embeddings,
48
+ embedding_dim=model.fast_embeddings.embedding_dim,
49
+ padding_idx=model.fast_embeddings.padding_idx,
50
+ r=lora_config.r,
51
+ lora_alpha=lora_config.lora_alpha,
52
+ )
53
+
54
+ # Dual-AR model
55
+ linears.append((model, "fast_output"))
56
+
57
+ for layer in model.fast_layers:
58
+ linears.extend([(layer.attention, "wqkv"), (layer.attention, "wo")])
59
+ linears.extend(
60
+ [
61
+ (layer.feed_forward, "w1"),
62
+ (layer.feed_forward, "w2"),
63
+ (layer.feed_forward, "w3"),
64
+ ]
65
+ )
66
+
67
+ for module, layer in linears:
68
+ updated_linear = lora.Linear(
69
+ in_features=getattr(module, layer).in_features,
70
+ out_features=getattr(module, layer).out_features,
71
+ bias=getattr(module, layer).bias,
72
+ r=lora_config.r,
73
+ lora_alpha=lora_config.lora_alpha,
74
+ lora_dropout=lora_config.lora_dropout,
75
+ )
76
+ setattr(module, layer, updated_linear)
77
+
78
+ # Mark only the LoRA layers as trainable
79
+ lora.mark_only_lora_as_trainable(model, bias="none")
80
+
81
+
82
+ def get_merged_state_dict(model):
83
+ # This line will merge the state dict of the model and the LoRA parameters
84
+ model.eval()
85
+
86
+ # Then we need to remove the LoRA parameters from the state dict
87
+ state_dict = model.state_dict()
88
+ for name in list(state_dict.keys()):
89
+ if "lora" in name:
90
+ state_dict.pop(name)
91
+
92
+ return state_dict
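
For orientation, the sketch below shows how these helpers are typically combined with the transformer defined earlier in this commit: wrap the model's linear and embedding layers with LoRA, fine-tune, then export a merged state dict. It is a hedged example, not part of the upload; the checkpoint path and the r/alpha values are placeholders.

import torch

from fish_speech.models.text2semantic.llama import BaseTransformer
from fish_speech.models.text2semantic.lora import LoraConfig, get_merged_state_dict, setup_lora

# Placeholder checkpoint directory containing config.json, model.pth and the tokenizer files.
model = BaseTransformer.from_pretrained("checkpoints/your-checkpoint", load_weights=True)
setup_lora(model, LoraConfig(r=8, lora_alpha=16))  # only LoRA parameters remain trainable
# ... run the fine-tuning loop here ...
merged = get_merged_state_dict(model)  # eval() folds LoRA weights in, lora_* keys are dropped
torch.save(merged, "model.merged.pth")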
fish_speech/text/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .clean import clean_text
2
- from .spliter import split_text
3
-
4
- __all__ = ["clean_text", "split_text"]
 
1
+ from .clean import clean_text
2
+ from .spliter import split_text
3
+
4
+ __all__ = ["clean_text", "split_text"]
fish_speech/text/clean.py CHANGED
@@ -1,37 +1,37 @@
1
- import re
2
-
3
- SYMBOLS_MAPPING = {
4
- "‘": "'",
5
- "’": "'",
6
- }
7
-
8
- REPLACE_SYMBOL_REGEX = re.compile(
9
- "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
10
- )
11
-
12
-
13
- EMOJI_REGEX = re.compile(
14
- "["
15
- "\U0001F600-\U0001F64F" # emoticons
16
- "\U0001F300-\U0001F5FF" # symbols & pictographs
17
- "\U0001F680-\U0001F6FF" # transport & map symbols
18
- "\U0001F1E0-\U0001F1FF" # flags (iOS)
19
- "]+",
20
- flags=re.UNICODE,
21
- )
22
-
23
-
24
- def clean_text(text):
25
- # Clean the text
26
- text = text.strip()
27
-
28
- # Replace all chinese symbols with their english counterparts
29
- text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
30
-
31
- # Remove emojis
32
- text = EMOJI_REGEX.sub(r"", text)
33
-
34
- # Remove continuous periods (...) and commas (,,,)
35
- text = re.sub(r"[,]{2,}", lambda m: m.group()[0], text)
36
-
37
- return text
 
1
+ import re
2
+
3
+ SYMBOLS_MAPPING = {
4
+ "‘": "'",
5
+ "’": "'",
6
+ }
7
+
8
+ REPLACE_SYMBOL_REGEX = re.compile(
9
+ "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
10
+ )
11
+
12
+
13
+ EMOJI_REGEX = re.compile(
14
+ "["
15
+ "\U0001f600-\U0001f64f" # emoticons
16
+ "\U0001f300-\U0001f5ff" # symbols & pictographs
17
+ "\U0001f680-\U0001f6ff" # transport & map symbols
18
+ "\U0001f1e0-\U0001f1ff" # flags (iOS)
19
+ "]+",
20
+ flags=re.UNICODE,
21
+ )
22
+
23
+
24
+ def clean_text(text):
25
+ # Clean the text
26
+ text = text.strip()
27
+
28
+ # Normalize curly apostrophes to their ASCII counterparts
29
+ text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
30
+
31
+ # Remove emojis
32
+ text = EMOJI_REGEX.sub(r"", text)
33
+
34
+ # Collapse repeated commas (e.g. ",,," or ",,,") into a single comma
35
+ text = re.sub(r"[,]{2,}", lambda m: m.group()[0], text)
36
+
37
+ return text
fish_speech/text/spliter.py CHANGED
@@ -1,130 +1,130 @@
1
- import re
2
- import string
3
-
4
- from fish_speech.text.clean import clean_text
5
-
6
-
7
- def utf_8_len(text: str):
8
- return len(text.encode("utf-8"))
9
-
10
-
11
- def break_text(texts, length, splits: set):
12
- for text in texts:
13
- if utf_8_len(text) <= length:
14
- yield text
15
- continue
16
-
17
- curr = ""
18
- for char in text:
19
- curr += char
20
-
21
- if char in splits:
22
- yield curr
23
- curr = ""
24
-
25
- if curr:
26
- yield curr
27
-
28
-
29
- def break_text_by_length(texts, length):
30
- for text in texts:
31
- if utf_8_len(text) <= length:
32
- yield text
33
- continue
34
-
35
- curr = ""
36
- for char in text:
37
- curr += char
38
-
39
- if utf_8_len(curr) >= length:
40
- yield curr
41
- curr = ""
42
-
43
- if curr:
44
- yield curr
45
-
46
-
47
- def add_cleaned(curr, segments):
48
- curr = curr.strip()
49
- if curr and not all(c.isspace() or c in string.punctuation for c in curr):
50
- segments.append(curr)
51
-
52
-
53
- def protect_float(text):
54
- # Turns 3.14 into <3_f_14> to prevent splitting
55
- return re.sub(r"(\d+)\.(\d+)", r"<\1_f_\2>", text)
56
-
57
-
58
- def unprotect_float(text):
59
- # Turns <3_f_14> into 3.14
60
- return re.sub(r"<(\d+)_f_(\d+)>", r"\1.\2", text)
61
-
62
-
63
- def split_text(text, length):
64
- text = clean_text(text)
65
-
66
- # Break the text into pieces with following rules:
67
- # 1. Split the text at ".", "!", "?" if text is NOT a float
68
- # 2. If the text is longer than length, split at ","
69
- # 3. If the text is still longer than length, split at " "
70
- # 4. If the text is still longer than length, split at any character to length
71
-
72
- texts = [text]
73
- texts = map(protect_float, texts)
74
- texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"})
75
- texts = map(unprotect_float, texts)
76
- texts = break_text(texts, length, {",", ","})
77
- texts = break_text(texts, length, {" "})
78
- texts = list(break_text_by_length(texts, length))
79
-
80
- # Then, merge the texts into segments with length <= length
81
- segments = []
82
- curr = ""
83
-
84
- for text in texts:
85
- if utf_8_len(curr) + utf_8_len(text) <= length:
86
- curr += text
87
- else:
88
- add_cleaned(curr, segments)
89
- curr = text
90
-
91
- if curr:
92
- add_cleaned(curr, segments)
93
-
94
- return segments
95
-
96
-
97
- if __name__ == "__main__":
98
- # Test the split_text function
99
-
100
- text = "This is a test sentence. This is another test sentence. And a third one."
101
-
102
- assert split_text(text, 50) == [
103
- "This is a test sentence.",
104
- "This is another test sentence. And a third one.",
105
- ]
106
- assert split_text("a,aaaaaa3.14", 10) == ["a,", "aaaaaa3.14"]
107
- assert split_text(" ", 10) == []
108
- assert split_text("a", 10) == ["a"]
109
-
110
- text = "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines."
111
- assert split_text(text, 50) == [
112
- "This is a test sentence with only commas,",
113
- "and no dots, and no exclamation marks,",
114
- "and no question marks, and no newlines.",
115
- ]
116
-
117
- text = "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence."
118
- # First half split at " ", second half split at ","
119
- assert split_text(text, 50) == [
120
- "This is a test sentence This is a test sentence",
121
- "This is a test sentence. This is a test sentence,",
122
- "This is a test sentence, This is a test sentence.",
123
- ]
124
-
125
- text = "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。"
126
- assert split_text(text, 50) == [
127
- "这是一段很长的中文文本,",
128
- "而且没有句号,也没有感叹号,",
129
- "也没有问号,也没有换行符.",
130
- ]
 
1
+ import re
2
+ import string
3
+
4
+ from fish_speech.text.clean import clean_text
5
+
6
+
7
+ def utf_8_len(text: str):
8
+ return len(text.encode("utf-8"))
9
+
10
+
11
+ def break_text(texts, length, splits: set):
12
+ for text in texts:
13
+ if utf_8_len(text) <= length:
14
+ yield text
15
+ continue
16
+
17
+ curr = ""
18
+ for char in text:
19
+ curr += char
20
+
21
+ if char in splits:
22
+ yield curr
23
+ curr = ""
24
+
25
+ if curr:
26
+ yield curr
27
+
28
+
29
+ def break_text_by_length(texts, length):
30
+ for text in texts:
31
+ if utf_8_len(text) <= length:
32
+ yield text
33
+ continue
34
+
35
+ curr = ""
36
+ for char in text:
37
+ curr += char
38
+
39
+ if utf_8_len(curr) >= length:
40
+ yield curr
41
+ curr = ""
42
+
43
+ if curr:
44
+ yield curr
45
+
46
+
47
+ def add_cleaned(curr, segments):
48
+ curr = curr.strip()
49
+ if curr and not all(c.isspace() or c in string.punctuation for c in curr):
50
+ segments.append(curr)
51
+
52
+
53
+ def protect_float(text):
54
+ # Turns 3.14 into <3_f_14> to prevent splitting
55
+ return re.sub(r"(\d+)\.(\d+)", r"<\1_f_\2>", text)
56
+
57
+
58
+ def unprotect_float(text):
59
+ # Turns <3_f_14> into 3.14
60
+ return re.sub(r"<(\d+)_f_(\d+)>", r"\1.\2", text)
61
+
62
+
63
+ def split_text(text, length):
64
+ text = clean_text(text)
65
+
66
+ # Break the text into pieces with following rules:
67
+ # 1. Split the text at ".", "!", "?" if text is NOT a float
68
+ # 2. If the text is longer than length, split at ","
69
+ # 3. If the text is still longer than length, split at " "
70
+ # 4. If the text is still longer than length, split at any character to length
71
+
72
+ texts = [text]
73
+ texts = map(protect_float, texts)
74
+ texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"})
75
+ texts = map(unprotect_float, texts)
76
+ texts = break_text(texts, length, {",", ","})
77
+ texts = break_text(texts, length, {" "})
78
+ texts = list(break_text_by_length(texts, length))
79
+
80
+ # Then, merge the texts into segments with length <= length
81
+ segments = []
82
+ curr = ""
83
+
84
+ for text in texts:
85
+ if utf_8_len(curr) + utf_8_len(text) <= length:
86
+ curr += text
87
+ else:
88
+ add_cleaned(curr, segments)
89
+ curr = text
90
+
91
+ if curr:
92
+ add_cleaned(curr, segments)
93
+
94
+ return segments
95
+
96
+
97
+ if __name__ == "__main__":
98
+ # Test the split_text function
99
+
100
+ text = "This is a test sentence. This is another test sentence. And a third one."
101
+
102
+ assert split_text(text, 50) == [
103
+ "This is a test sentence.",
104
+ "This is another test sentence. And a third one.",
105
+ ]
106
+ assert split_text("a,aaaaaa3.14", 10) == ["a,", "aaaaaa3.14"]
107
+ assert split_text(" ", 10) == []
108
+ assert split_text("a", 10) == ["a"]
109
+
110
+ text = "This is a test sentence with only commas, and no dots, and no exclamation marks, and no question marks, and no newlines."
111
+ assert split_text(text, 50) == [
112
+ "This is a test sentence with only commas,",
113
+ "and no dots, and no exclamation marks,",
114
+ "and no question marks, and no newlines.",
115
+ ]
116
+
117
+ text = "This is a test sentence This is a test sentence This is a test sentence. This is a test sentence, This is a test sentence, This is a test sentence."
118
+ # First half split at " ", second half split at ","
119
+ assert split_text(text, 50) == [
120
+ "This is a test sentence This is a test sentence",
121
+ "This is a test sentence. This is a test sentence,",
122
+ "This is a test sentence, This is a test sentence.",
123
+ ]
124
+
125
+ text = "这是一段很长的中文文本,而且没有句号,也没有感叹号,也没有问号,也没有换行符。"
126
+ assert split_text(text, 50) == [
127
+ "这是一段很长的中文文本,",
128
+ "而且没有句号,也没有感叹号,",
129
+ "也没有问号,也没有换行符.",
130
+ ]
fish_speech/tokenizer.py CHANGED
@@ -1,152 +1,179 @@
1
- import base64
2
- import json
3
- import logging
4
- from pathlib import Path
5
-
6
- import tiktoken
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
- # This is a modified version of the default pattern from GPT-4o, that better handles punctuations.
11
- FISH_TIKTOKEN_PATTERN = "|".join(
12
- [
13
- r"(?i:'s|'t|'re|'ve|'m|'ll|'d)",
14
- r"\p{P}",
15
- r"[^\r\n\p{L}\p{N}]?\p{L}+",
16
- r"\p{N}",
17
- r" ?[^\s\p{L}\p{N}]+[\r\n]*",
18
- r"\s*[\r\n]+",
19
- r"\s+(\?!\S)",
20
- r"\s+",
21
- ]
22
- )
23
- TIKTOKEN_MAX_ENCODE_CHARS = 400_000
24
-
25
- BOS_TOKEN = "<|begin_of_text|>"
26
- EOS_TOKEN = "<|end_of_text|>"
27
- PAD_TOKEN = "<|pad|>"
28
- IM_START_TOKEN = "<|im_start|>"
29
- IM_END_TOKEN = "<|im_end|>"
30
-
31
- MODALITY_TEXT_TOKEN = "<|text|>"
32
- MODALITY_VOICE_TOKEN = "<|voice|>"
33
- MODALITY_INTERLEAVE_TOKEN = "<|interleave|>"
34
- MODALITY_TOKENS = {
35
- "text": MODALITY_TEXT_TOKEN,
36
- "voice": MODALITY_VOICE_TOKEN,
37
- "interleave": MODALITY_INTERLEAVE_TOKEN,
38
- }
39
-
40
- PLACEHOLDER_TOKEN = [""] * 4
41
- for i in range(4):
42
- PLACEHOLDER_TOKEN[i] = f"<|placeholder:{i}|>"
43
-
44
- SEMANTIC_TOKEN_TEMPLATE = "<|semantic:{i}|>"
45
- SEMANTIC_TOKENS = [SEMANTIC_TOKEN_TEMPLATE.format(i=i) for i in range(1024)]
46
-
47
- # Warning: when you add a new special token, you should only add it to the end of the list.
48
- ALL_SPECIAL_TOKENS = [
49
- BOS_TOKEN,
50
- EOS_TOKEN,
51
- PAD_TOKEN,
52
- IM_START_TOKEN,
53
- IM_END_TOKEN,
54
- PLACEHOLDER_TOKEN[0],
55
- PLACEHOLDER_TOKEN[1],
56
- PLACEHOLDER_TOKEN[2],
57
- PLACEHOLDER_TOKEN[3],
58
- MODALITY_TEXT_TOKEN,
59
- MODALITY_VOICE_TOKEN,
60
- MODALITY_INTERLEAVE_TOKEN,
61
- *SEMANTIC_TOKENS,
62
- ]
63
-
64
-
65
- class FishTokenizer:
66
- def __init__(self, model_path: str) -> None:
67
- mergeable_ranks = self.load_tiktoken_bpe(model_path)
68
- special_token_begin = len(mergeable_ranks)
69
- self.all_special_tokens_with_ids = {
70
- token: special_token_begin + i for i, token in enumerate(ALL_SPECIAL_TOKENS)
71
- }
72
- self.semantic_id_to_token_id = {
73
- i: self.all_special_tokens_with_ids[token]
74
- for i, token in enumerate(SEMANTIC_TOKENS)
75
- }
76
- self.semantic_begin_id = self.all_special_tokens_with_ids[SEMANTIC_TOKENS[0]]
77
- self.semantic_end_id = self.all_special_tokens_with_ids[SEMANTIC_TOKENS[-1]]
78
-
79
- self.tkt_model = tiktoken.core.Encoding(
80
- name=Path(model_path).stem,
81
- pat_str=FISH_TIKTOKEN_PATTERN,
82
- mergeable_ranks=mergeable_ranks,
83
- special_tokens=self.all_special_tokens_with_ids,
84
- )
85
-
86
- @staticmethod
87
- def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
88
- data = {}
89
- for line in open(tiktoken_bpe_file).read().splitlines():
90
- if not line:
91
- continue
92
- token, rank = line.split()
93
- data[base64.b64decode(token)] = int(rank)
94
- return data
95
-
96
- def get_token_id(self, token: str) -> int:
97
- return self.all_special_tokens_with_ids[token]
98
-
99
- def encode(self, s: str, allowed_special: bool | set[str] = True) -> list[int]:
100
- assert isinstance(s, str)
101
-
102
- subs = []
103
- for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS):
104
- subs.append(s[i : i + TIKTOKEN_MAX_ENCODE_CHARS])
105
-
106
- if allowed_special is True:
107
- allowed_special = self.tkt_model.special_tokens_set
108
- elif allowed_special is False:
109
- allowed_special = set()
110
-
111
- return sum(
112
- self.tkt_model.encode_batch(
113
- subs, allowed_special=allowed_special, disallowed_special=set()
114
- ),
115
- start=[],
116
- )
117
-
118
- def decode(self, tokens: list[int]) -> str:
119
- return self.tkt_model.decode(tokens)
120
-
121
- def save_pretrained(self, path: str):
122
- path = Path(path)
123
- path.mkdir(parents=True, exist_ok=True)
124
-
125
- with open(path / "tokenizer.tiktoken", "w") as f:
126
- for token, rank in self.tkt_model._mergeable_ranks.items():
127
- f.write(f"{base64.b64encode(token).decode()} {rank}\n")
128
-
129
- with open(path / "special_tokens.json", "w") as f:
130
- json.dump(
131
- self.all_special_tokens_with_ids,
132
- f,
133
- indent=2,
134
- ensure_ascii=False,
135
- )
136
-
137
- @staticmethod
138
- def from_pretrained(path: str):
139
- return FishTokenizer(Path(path) / "tokenizer.tiktoken")
140
-
141
-
142
- if __name__ == "__main__":
143
- tokenizer = FishTokenizer("data/mpacks/v1.4-pretrain/tokenizer.all.tiktoken")
144
- tokenizer.save_pretrained("checkpoints/fish-speech-0.5B")
145
- tokenizer = FishTokenizer.from_pretrained("checkpoints/fish-speech-0.5B")
146
-
147
- print(
148
- [
149
- tokenizer.decode([i])
150
- for i in tokenizer.encode(f"{BOS_TOKEN}你好,世界!{EOS_TOKEN}")
151
- ]
152
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import logging
4
+ import re
5
+ from pathlib import Path
6
+
7
+ import tiktoken
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # This is a modified version of the default pattern from GPT-4o, that better handles punctuations.
12
+ FISH_TIKTOKEN_PATTERN = "|".join(
13
+ [
14
+ r"(?i:'s|'t|'re|'ve|'m|'ll|'d)",
15
+ r"\p{P}",
16
+ r"[^\r\n\p{L}\p{N}]?\p{L}+",
17
+ r"\p{N}",
18
+ r" ?[^\s\p{L}\p{N}]+[\r\n]*",
19
+ r"\s*[\r\n]+",
20
+ r"\s+(\?!\S)",
21
+ r"\s+",
22
+ ]
23
+ )
24
+ TIKTOKEN_MAX_ENCODE_CHARS = 400_000
25
+
26
+ BOS_TOKEN = "<|begin_of_text|>"
27
+ EOS_TOKEN = "<|end_of_text|>"
28
+ PAD_TOKEN = "<|pad|>"
29
+ IM_START_TOKEN = "<|im_start|>"
30
+ IM_END_TOKEN = "<|im_end|>"
31
+ PHONEME_START_TOKEN = "<|phoneme_start|>"
32
+ PHONEME_END_TOKEN = "<|phoneme_end|>"
33
+ TOOL_CALL_START_TOKEN = "<|tool_call_start|>"
34
+ TOOL_CALL_END_TOKEN = "<|tool_call_end|>"
35
+
36
+ MODALITY_TEXT_TOKEN = "<|text|>"
37
+ MODALITY_VOICE_TOKEN = "<|voice|>"
38
+ MODALITY_INTERLEAVE_TOKEN = "<|interleave|>"
39
+ AUDIO_START_TOKEN = "<|audio_start|>"
40
+ AUDIO_END_TOKEN = "<|audio_end|>"
41
+ AUDIO_EMBED_TOKEN = "<|audio|>"
42
+ MODALITY_TOKENS = {
43
+ "text": MODALITY_TEXT_TOKEN,
44
+ "voice": MODALITY_VOICE_TOKEN,
45
+ "interleave": MODALITY_INTERLEAVE_TOKEN,
46
+ }
47
+
48
+ SEMANTIC_TOKEN_TEMPLATE = "<|semantic:{i}|>"
49
+ SEMANTIC_TOKENS = [SEMANTIC_TOKEN_TEMPLATE.format(i=i) for i in range(1024)]
50
+
51
+ # Warning: when you add a new special token, you should only add it to the end of the list.
52
+ ALL_SPECIAL_TOKENS = [
53
+ BOS_TOKEN,
54
+ EOS_TOKEN,
55
+ PAD_TOKEN,
56
+ IM_START_TOKEN,
57
+ IM_END_TOKEN,
58
+ PHONEME_START_TOKEN,
59
+ PHONEME_END_TOKEN,
60
+ TOOL_CALL_START_TOKEN,
61
+ TOOL_CALL_END_TOKEN,
62
+ MODALITY_TEXT_TOKEN,
63
+ MODALITY_VOICE_TOKEN,
64
+ MODALITY_INTERLEAVE_TOKEN,
65
+ AUDIO_START_TOKEN,
66
+ AUDIO_END_TOKEN,
67
+ AUDIO_EMBED_TOKEN,
68
+ *SEMANTIC_TOKENS,
69
+ ]
70
+
71
+
72
+ class FishTokenizer:
73
+ def __init__(
74
+ self, model_path: str, special_tokens: list[str] = ALL_SPECIAL_TOKENS
75
+ ) -> None:
76
+ mergeable_ranks = self.load_tiktoken_bpe(model_path)
77
+ special_token_begin = len(mergeable_ranks)
78
+ self.all_special_tokens_with_ids = {
79
+ token: special_token_begin + i for i, token in enumerate(special_tokens)
80
+ }
81
+
82
+ self.semantic_id_to_token_id = {}
83
+ end_idx = 0
84
+ for token in special_tokens:
85
+ if token.startswith("<|semantic:"):
86
+ idx = int(re.match(r"<\|semantic:(\d+)\|>", token).group(1))
87
+ self.semantic_id_to_token_id[idx] = self.all_special_tokens_with_ids[
88
+ token
89
+ ]
90
+
91
+ if idx > end_idx:
92
+ end_idx = idx
93
+
94
+ self.semantic_begin_id = self.semantic_id_to_token_id[0]
95
+ self.semantic_end_id = self.semantic_id_to_token_id[end_idx]
96
+
97
+ self.tkt_model = tiktoken.core.Encoding(
98
+ name=Path(model_path).stem,
99
+ pat_str=FISH_TIKTOKEN_PATTERN,
100
+ mergeable_ranks=mergeable_ranks,
101
+ special_tokens=self.all_special_tokens_with_ids,
102
+ )
103
+
104
+ @property
105
+ def vocab_size(self):
106
+ return len(self.tkt_model._mergeable_ranks)
107
+
108
+ @property
109
+ def num_special_tokens(self):
110
+ return len(self.all_special_tokens_with_ids)
111
+
112
+ @staticmethod
113
+ def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
114
+ data = {}
115
+ for line in open(tiktoken_bpe_file).read().splitlines():
116
+ if not line:
117
+ continue
118
+ token, rank = line.split()
119
+ if token == "=":
120
+ continue
121
+ data[base64.b64decode(token)] = int(rank)
122
+ return data
123
+
124
+ def get_token_id(self, token: str) -> int:
125
+ return self.all_special_tokens_with_ids[token]
126
+
127
+ def encode(self, s: str, allowed_special: bool | set[str] = True) -> list[int]:
128
+ assert isinstance(s, str)
129
+
130
+ subs = []
131
+ for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS):
132
+ subs.append(s[i : i + TIKTOKEN_MAX_ENCODE_CHARS])
133
+
134
+ if allowed_special is True:
135
+ allowed_special = self.tkt_model.special_tokens_set
136
+ elif allowed_special is False:
137
+ allowed_special = set()
138
+
139
+ return sum(
140
+ self.tkt_model.encode_batch(
141
+ subs, allowed_special=allowed_special, disallowed_special=set()
142
+ ),
143
+ start=[],
144
+ )
145
+
146
+ def decode(self, tokens: list[int]) -> str:
147
+ return self.tkt_model.decode(tokens)
148
+
149
+ def save_pretrained(self, path: str):
150
+ path = Path(path)
151
+ path.mkdir(parents=True, exist_ok=True)
152
+
153
+ with open(path / "tokenizer.tiktoken", "w") as f:
154
+ for token, rank in self.tkt_model._mergeable_ranks.items():
155
+ a = base64.b64encode(token).decode()
156
+ if a == "":
157
+ a = "="
158
+ f.write(f"{a} {rank}\n")
159
+
160
+ with open(path / "special_tokens.json", "w") as f:
161
+ json.dump(
162
+ self.all_special_tokens_with_ids,
163
+ f,
164
+ indent=2,
165
+ ensure_ascii=False,
166
+ )
167
+
168
+ @staticmethod
169
+ def from_pretrained(path: str):
170
+ special_tokens_path = Path(path) / "special_tokens.json"
171
+ if special_tokens_path.exists():
172
+ with open(special_tokens_path) as f:
173
+ all_special_tokens_with_ids = json.load(f)
174
+ else:
175
+ all_special_tokens_with_ids = ALL_SPECIAL_TOKENS
176
+
177
+ return FishTokenizer(
178
+ Path(path) / "tokenizer.tiktoken", all_special_tokens_with_ids
179
+ )
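For context, here is a minimal usage sketch of the tokenizer API above. It is not part of this commit, and the checkpoint directory name is only an assumption for illustration.

from fish_speech.tokenizer import FishTokenizer, IM_END_TOKEN, IM_START_TOKEN

# Load a tokenizer previously written by save_pretrained(); the path is hypothetical.
tokenizer = FishTokenizer.from_pretrained("checkpoints/openaudio-s1-mini")

ids = tokenizer.encode(f"{IM_START_TOKEN}user\nHello, world!{IM_END_TOKEN}")
print(ids)
print(tokenizer.decode(ids))

# Semantic codebook tokens occupy a contiguous id range, so VQ codes can be
# mapped to token ids with a simple offset from semantic_begin_id.
print(tokenizer.semantic_begin_id, tokenizer.semantic_end_id)
print(tokenizer.get_token_id("<|semantic:0|>") == tokenizer.semantic_begin_id)  # True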
fish_speech/utils/__init__.py CHANGED
@@ -1,24 +1,24 @@
+from .braceexpand import braceexpand
+from .context import autocast_exclude_mps
+from .file import get_latest_checkpoint
+from .instantiators import instantiate_callbacks, instantiate_loggers
+from .logger import RankedLogger
+from .logging_utils import log_hyperparameters
+from .rich_utils import enforce_tags, print_config_tree
+from .utils import extras, get_metric_value, set_seed, task_wrapper
+
+__all__ = [
+    "enforce_tags",
+    "extras",
+    "get_metric_value",
+    "RankedLogger",
+    "instantiate_callbacks",
+    "instantiate_loggers",
+    "log_hyperparameters",
+    "print_config_tree",
+    "task_wrapper",
+    "braceexpand",
+    "get_latest_checkpoint",
+    "autocast_exclude_mps",
+    "set_seed",
+]
fish_speech/utils/braceexpand.py CHANGED
@@ -1,217 +1,217 @@
+"""
+Bash-style brace expansion
+Copied from: https://github.com/trendels/braceexpand/blob/main/src/braceexpand/__init__.py
+License: MIT
+"""
+
+import re
+import string
+from itertools import chain, product
+from typing import Iterable, Iterator, Optional
+
+__all__ = ["braceexpand", "alphabet", "UnbalancedBracesError"]
+
+
+class UnbalancedBracesError(ValueError):
+    pass
+
+
+alphabet = string.ascii_uppercase + string.ascii_lowercase
+
+int_range_re = re.compile(r"^(-?\d+)\.\.(-?\d+)(?:\.\.-?(\d+))?$")
+char_range_re = re.compile(r"^([A-Za-z])\.\.([A-Za-z])(?:\.\.-?(\d+))?$")
+escape_re = re.compile(r"\\(.)")
+
+
+def braceexpand(pattern: str, escape: bool = True) -> Iterator[str]:
+    """braceexpand(pattern) -> iterator over generated strings
+
+    Returns an iterator over the strings resulting from brace expansion
+    of pattern. This function implements Brace Expansion as described in
+    bash(1), with the following limitations:
+
+    * A pattern containing unbalanced braces will raise an
+      UnbalancedBracesError exception. In bash, unbalanced braces will either
+      be partly expanded or ignored.
+
+    * A mixed-case character range like '{Z..a}' or '{a..Z}' will not
+      include the characters '[]^_`' between 'Z' and 'a'.
+
+    When escape is True (the default), characters in pattern can be
+    prefixed with a backslash to cause them not to be interpreted as
+    special characters for brace expansion (such as '{', '}', ',').
+    To pass through a literal backslash, double it ('\\\\').
+
+    When escape is False, backslashes in pattern have no special
+    meaning and will be preserved in the output.
+
+    Examples:
+
+    >>> from braceexpand import braceexpand
+
+    # Integer range
+    >>> list(braceexpand('item{1..3}'))
+    ['item1', 'item2', 'item3']
+
+    # Character range
+    >>> list(braceexpand('{a..c}'))
+    ['a', 'b', 'c']
+
+    # Sequence
+    >>> list(braceexpand('index.html{,.backup}'))
+    ['index.html', 'index.html.backup']
+
+    # Nested patterns
+    >>> list(braceexpand('python{2.{5..7},3.{2,3}}'))
+    ['python2.5', 'python2.6', 'python2.7', 'python3.2', 'python3.3']
+
+    # Prefixing an integer with zero causes all numbers to be padded to
+    # the same width.
+    >>> list(braceexpand('{07..10}'))
+    ['07', '08', '09', '10']
+
+    # An optional increment can be specified for ranges.
+    >>> list(braceexpand('{a..g..2}'))
+    ['a', 'c', 'e', 'g']
+
+    # Ranges can go in both directions.
+    >>> list(braceexpand('{4..1}'))
+    ['4', '3', '2', '1']
+
+    # Numbers can be negative
+    >>> list(braceexpand('{2..-1}'))
+    ['2', '1', '0', '-1']
+
+    # Unbalanced braces raise an exception.
+    >>> list(braceexpand('{1{2,3}'))
+    Traceback (most recent call last):
+        ...
+    UnbalancedBracesError: Unbalanced braces: '{1{2,3}'
+
+    # By default, the backslash is the escape character.
+    >>> list(braceexpand(r'{1\\{2,3}'))
+    ['1{2', '3']
+
+    # Setting 'escape' to False disables backslash escaping.
+    >>> list(braceexpand(r'\\{1,2}', escape=False))
+    ['\\\\1', '\\\\2']
+
+    """
+    return (
+        escape_re.sub(r"\1", s) if escape else s for s in parse_pattern(pattern, escape)
+    )
+
+
+def parse_pattern(pattern: str, escape: bool) -> Iterator[str]:
+    start = 0
+    pos = 0
+    bracketdepth = 0
+    items: list[Iterable[str]] = []
+
+    # print 'pattern:', pattern
+    while pos < len(pattern):
+        if escape and pattern[pos] == "\\":
+            pos += 2
+            continue
+        elif pattern[pos] == "{":
+            if bracketdepth == 0 and pos > start:
+                # print 'literal:', pattern[start:pos]
+                items.append([pattern[start:pos]])
+                start = pos
+            bracketdepth += 1
+        elif pattern[pos] == "}":
+            bracketdepth -= 1
+            if bracketdepth == 0:
+                # print 'expression:', pattern[start+1:pos]
+                expr = pattern[start + 1 : pos]
+                item = parse_expression(expr, escape)
+                if item is None:  # not a range or sequence
+                    items.extend([["{"], parse_pattern(expr, escape), ["}"]])
+                else:
+                    items.append(item)
+                start = pos + 1  # skip the closing brace
+        pos += 1
+
+    if bracketdepth != 0:  # unbalanced braces
+        raise UnbalancedBracesError("Unbalanced braces: '%s'" % pattern)
+
+    if start < pos:
+        items.append([pattern[start:]])
+
+    return ("".join(item) for item in product(*items))
+
+
+def parse_expression(expr: str, escape: bool) -> Optional[Iterable[str]]:
+    int_range_match = int_range_re.match(expr)
+    if int_range_match:
+        return make_int_range(*int_range_match.groups())
+
+    char_range_match = char_range_re.match(expr)
+    if char_range_match:
+        return make_char_range(*char_range_match.groups())
+
+    return parse_sequence(expr, escape)
+
+
+def parse_sequence(seq: str, escape: bool) -> Optional[Iterator[str]]:
+    # sequence -> chain(*sequence_items)
+    start = 0
+    pos = 0
+    bracketdepth = 0
+    items: list[Iterable[str]] = []
+
+    # print 'sequence:', seq
+    while pos < len(seq):
+        if escape and seq[pos] == "\\":
+            pos += 2
+            continue
+        elif seq[pos] == "{":
+            bracketdepth += 1
+        elif seq[pos] == "}":
+            bracketdepth -= 1
+        elif seq[pos] == "," and bracketdepth == 0:
+            items.append(parse_pattern(seq[start:pos], escape))
+            start = pos + 1  # skip the comma
+        pos += 1
+
+    if bracketdepth != 0:
+        raise UnbalancedBracesError
+    if not items:
+        return None
+
+    # part after the last comma (may be the empty string)
+    items.append(parse_pattern(seq[start:], escape))
+    return chain(*items)
+
+
+def make_int_range(left: str, right: str, incr: Optional[str] = None) -> Iterator[str]:
+    if any([s.startswith(("0", "-0")) for s in (left, right) if s not in ("0", "-0")]):
+        padding = max(len(left), len(right))
+    else:
+        padding = 0
+    step = (int(incr) or 1) if incr else 1
+    start = int(left)
+    end = int(right)
+    r = range(start, end + 1, step) if start < end else range(start, end - 1, -step)
+    fmt = "%0{}d".format(padding)
+    return (fmt % i for i in r)
+
+
+def make_char_range(left: str, right: str, incr: Optional[str] = None) -> str:
+    step = (int(incr) or 1) if incr else 1
+    start = alphabet.index(left)
+    end = alphabet.index(right)
+    if start < end:
+        return alphabet[start : end + 1 : step]
+    else:
+        end = end or -len(alphabet)
+        return alphabet[start : end - 1 : -step]
+
+
+if __name__ == "__main__":
+    import doctest
+    import sys
+
+    failed, _ = doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)
+    if failed:
+        sys.exit(1)
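As an aside, a small illustration of how this vendored helper is typically used to enumerate sharded file patterns; the paths below are made up.

from fish_speech.utils.braceexpand import braceexpand

# Zero-padded integer ranges keep a fixed width, which suits shard naming.
shards = list(braceexpand("data/protos/train-{000..019}.tar"))
print(len(shards))   # 20
print(shards[0])     # data/protos/train-000.tar
print(shards[-1])    # data/protos/train-019.tar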
fish_speech/utils/context.py CHANGED
@@ -1,13 +1,13 @@
+from contextlib import nullcontext
+
+import torch
+
+
+def autocast_exclude_mps(
+    device_type: str, dtype: torch.dtype
+) -> nullcontext | torch.autocast:
+    return (
+        nullcontext()
+        if torch.backends.mps.is_available()
+        else torch.autocast(device_type, dtype)
+    )
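A short sketch of how this helper is meant to be used; the device and dtype values are illustrative only.

import torch

from fish_speech.utils import autocast_exclude_mps

device = "cuda" if torch.cuda.is_available() else "cpu"

# On Apple MPS this degrades to a nullcontext, so the matmul runs in full
# precision instead of failing under an unsupported autocast backend.
with autocast_exclude_mps(device_type=device, dtype=torch.bfloat16):
    x = torch.randn(8, 8, device=device)
    y = x @ x.T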
fish_speech/utils/file.py CHANGED
@@ -1,16 +1,139 @@
-import os
-from pathlib import Path
-
-
-def get_latest_checkpoint(path: Path | str) -> Path | None:
-    # Find the latest checkpoint
-    ckpt_dir = Path(path)
-
-    if ckpt_dir.exists() is False:
-        return None
-
-    ckpts = sorted(ckpt_dir.glob("*.ckpt"), key=os.path.getmtime)
-    if len(ckpts) == 0:
-        return None
-
-    return ckpts[-1]
+import os
+from pathlib import Path
+from typing import Union
+
+from loguru import logger
+from natsort import natsorted
+
+AUDIO_EXTENSIONS = {
+    ".mp3",
+    ".wav",
+    ".flac",
+    ".ogg",
+    ".m4a",
+    ".wma",
+    ".aac",
+    ".aiff",
+    ".aif",
+    ".aifc",
+}
+
+VIDEO_EXTENSIONS = {
+    ".mp4",
+    ".avi",
+}
+
+
+def get_latest_checkpoint(path: Path | str) -> Path | None:
+    # Find the latest checkpoint
+    ckpt_dir = Path(path)
+
+    if ckpt_dir.exists() is False:
+        return None
+
+    ckpts = sorted(ckpt_dir.glob("*.ckpt"), key=os.path.getmtime)
+    if len(ckpts) == 0:
+        return None
+
+    return ckpts[-1]
+
+
+def audio_to_bytes(file_path):
+    if not file_path or not Path(file_path).exists():
+        return None
+    with open(file_path, "rb") as wav_file:
+        wav = wav_file.read()
+    return wav
+
+
+def read_ref_text(ref_text):
+    path = Path(ref_text)
+    if path.exists() and path.is_file():
+        with path.open("r", encoding="utf-8") as file:
+            return file.read()
+    return ref_text
+
+
+def list_files(
+    path: Union[Path, str],
+    extensions: set[str] = set(),
+    recursive: bool = False,
+    sort: bool = True,
+) -> list[Path]:
+    """List files in a directory.
+
+    Args:
+        path (Path): Path to the directory.
+        extensions (set, optional): Extensions to filter. Defaults to an empty set.
+        recursive (bool, optional): Whether to search recursively. Defaults to False.
+        sort (bool, optional): Whether to sort the files. Defaults to True.
+
+    Returns:
+        list: List of files.
+    """
+
+    if isinstance(path, str):
+        path = Path(path)
+
+    if not path.exists():
+        raise FileNotFoundError(f"Directory {path} does not exist.")
+
+    files = [file for ext in extensions for file in path.rglob(f"*{ext}")]
+
+    if sort:
+        files = natsorted(files)
+
+    return files
+
+
+def load_filelist(path: Path | str) -> list[tuple[Path, str, str, str]]:
+    """
+    Load a Bert-VITS2 style filelist.
+    """
+
+    files = set()
+    results = []
+    count_duplicated, count_not_found = 0, 0
+
+    LANGUAGE_TO_LANGUAGES = {
+        "zh": ["zh", "en"],
+        "jp": ["jp", "en"],
+        "en": ["en"],
+    }
+
+    with open(path, "r", encoding="utf-8") as f:
+        for line in f.readlines():
+            splits = line.strip().split("|", maxsplit=3)
+            if len(splits) != 4:
+                logger.warning(f"Invalid line: {line}")
+                continue
+
+            filename, speaker, language, text = splits
+            file = Path(filename)
+            language = language.strip().lower()
+
+            if language == "ja":
+                language = "jp"
+
+            assert language in ["zh", "jp", "en"], f"Invalid language {language}"
+            languages = LANGUAGE_TO_LANGUAGES[language]
+
+            if file in files:
+                logger.warning(f"Duplicated file: {file}")
+                count_duplicated += 1
+                continue
+
+            if not file.exists():
+                logger.warning(f"File not found: {file}")
+                count_not_found += 1
+                continue
+
+            results.append((file, speaker, languages, text))
+
+    if count_duplicated > 0:
+        logger.warning(f"Total duplicated files: {count_duplicated}")
+
+    if count_not_found > 0:
+        logger.warning(f"Total files not found: {count_not_found}")
+
+    return results
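A brief sketch of the new file helpers in use; the directory and filelist names are hypothetical.

from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist

# Gather audio for preprocessing, naturally sorted (1, 2, ..., 10 rather than 1, 10, 2).
wavs = list_files("data/raw", extensions=AUDIO_EXTENSIONS, recursive=True)
print(f"found {len(wavs)} audio files")

# Parse a Bert-VITS2 style filelist: one "path|speaker|language|text" entry per line.
entries = load_filelist("data/filelist.txt")
for file, speaker, languages, text in entries[:3]:
    print(file, speaker, languages, text)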
fish_speech/utils/instantiators.py CHANGED
@@ -1,50 +1,50 @@
+from typing import List
+
+import hydra
+from omegaconf import DictConfig
+from pytorch_lightning import Callback
+from pytorch_lightning.loggers import Logger
+
+from .logger import RankedLogger
+
+log = RankedLogger(__name__, rank_zero_only=True)
+
+
+def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:
+    """Instantiates callbacks from config."""
+
+    callbacks: List[Callback] = []
+
+    if not callbacks_cfg:
+        log.warning("No callback configs found! Skipping..")
+        return callbacks
+
+    if not isinstance(callbacks_cfg, DictConfig):
+        raise TypeError("Callbacks config must be a DictConfig!")
+
+    for _, cb_conf in callbacks_cfg.items():
+        if isinstance(cb_conf, DictConfig) and "_target_" in cb_conf:
+            log.info(f"Instantiating callback <{cb_conf._target_}>")
+            callbacks.append(hydra.utils.instantiate(cb_conf))
+
+    return callbacks
+
+
+def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]:
+    """Instantiates loggers from config."""
+
+    logger: List[Logger] = []
+
+    if not logger_cfg:
+        log.warning("No logger configs found! Skipping...")
+        return logger
+
+    if not isinstance(logger_cfg, DictConfig):
+        raise TypeError("Logger config must be a DictConfig!")
+
+    for _, lg_conf in logger_cfg.items():
+        if isinstance(lg_conf, DictConfig) and "_target_" in lg_conf:
+            log.info(f"Instantiating logger <{lg_conf._target_}>")
+            logger.append(hydra.utils.instantiate(lg_conf))
+
+    return logger
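For reference, a hedged sketch of the config shape these helpers expect; the callback settings below are illustrative, not the project defaults.

from omegaconf import OmegaConf

from fish_speech.utils import instantiate_callbacks

callbacks_cfg = OmegaConf.create(
    {
        "model_checkpoint": {
            "_target_": "pytorch_lightning.callbacks.ModelCheckpoint",
            "save_top_k": 3,
            "monitor": "train/loss",
        }
    }
)

# Every entry carrying a _target_ key is handed to hydra.utils.instantiate().
callbacks = instantiate_callbacks(callbacks_cfg)
print(callbacks)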
fish_speech/utils/logger.py CHANGED
@@ -1,55 +1,55 @@
+import logging
+from typing import Mapping, Optional
+
+from lightning_utilities.core.rank_zero import rank_prefixed_message, rank_zero_only
+
+
+class RankedLogger(logging.LoggerAdapter):
+    """A multi-GPU-friendly python command line logger."""
+
+    def __init__(
+        self,
+        name: str = __name__,
+        rank_zero_only: bool = True,
+        extra: Optional[Mapping[str, object]] = None,
+    ) -> None:
+        """Initializes a multi-GPU-friendly python command line logger that logs on all processes
+        with their rank prefixed in the log message.
+
+        :param name: The name of the logger. Default is ``__name__``.
+        :param rank_zero_only: Whether to force all logs to only occur on the rank zero process. Default is `True`.
+        :param extra: (Optional) A dict-like object which provides contextual information. See `logging.LoggerAdapter`.
+        """
+        logger = logging.getLogger(name)
+        super().__init__(logger=logger, extra=extra)
+        self.rank_zero_only = rank_zero_only
+
+    def log(
+        self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs
+    ) -> None:
+        """Delegate a log call to the underlying logger, after prefixing its message with the rank
+        of the process it's being logged from. If `'rank'` is provided, then the log will only
+        occur on that rank/process.
+
+        :param level: The level to log at. Look at `logging.__init__.py` for more information.
+        :param msg: The message to log.
+        :param rank: The rank to log at.
+        :param args: Additional args to pass to the underlying logging function.
+        :param kwargs: Any additional keyword args to pass to the underlying logging function.
+        """
+        if self.isEnabledFor(level):
+            msg, kwargs = self.process(msg, kwargs)
+            current_rank = getattr(rank_zero_only, "rank", None)
+            if current_rank is None:
+                raise RuntimeError(
+                    "The `rank_zero_only.rank` needs to be set before use"
+                )
+            msg = rank_prefixed_message(msg, current_rank)
+            if self.rank_zero_only:
+                if current_rank == 0:
+                    self.logger.log(level, msg, *args, **kwargs)
+            else:
+                if rank is None:
+                    self.logger.log(level, msg, *args, **kwargs)
+                elif current_rank == rank:
+                    self.logger.log(level, msg, *args, **kwargs)
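A minimal sketch of the logger in use; note that `rank_zero_only.rank` must already be set (pytorch_lightning normally does this when distributed training starts).

from fish_speech.utils import RankedLogger

log = RankedLogger(__name__, rank_zero_only=True)

# Emitted once, from rank 0, with the rank prefixed to the message.
log.info("Preparing dataloaders...")

# With rank_zero_only=False, a specific rank can be targeted per call:
# RankedLogger(__name__, rank_zero_only=False).info("only on rank 1", rank=1)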
fish_speech/utils/logging_utils.py CHANGED
@@ -1,48 +1,48 @@
+from lightning.pytorch.utilities import rank_zero_only
+
+from fish_speech.utils import logger as log
+
+
+@rank_zero_only
+def log_hyperparameters(object_dict: dict) -> None:
+    """Controls which config parts are saved by lightning loggers.
+
+    Additionally saves:
+    - Number of model parameters
+    """
+
+    hparams = {}
+
+    cfg = object_dict["cfg"]
+    model = object_dict["model"]
+    trainer = object_dict["trainer"]
+
+    if not trainer.logger:
+        log.warning("Logger not found! Skipping hyperparameter logging...")
+        return
+
+    hparams["model"] = cfg["model"]
+
+    # save number of model parameters
+    hparams["model/params/total"] = sum(p.numel() for p in model.parameters())
+    hparams["model/params/trainable"] = sum(
+        p.numel() for p in model.parameters() if p.requires_grad
+    )
+    hparams["model/params/non_trainable"] = sum(
+        p.numel() for p in model.parameters() if not p.requires_grad
+    )
+
+    hparams["data"] = cfg["data"]
+    hparams["trainer"] = cfg["trainer"]
+
+    hparams["callbacks"] = cfg.get("callbacks")
+    hparams["extras"] = cfg.get("extras")
+
+    hparams["task_name"] = cfg.get("task_name")
+    hparams["tags"] = cfg.get("tags")
+    hparams["ckpt_path"] = cfg.get("ckpt_path")
+    hparams["seed"] = cfg.get("seed")
+
+    # send hparams to all loggers
+    for logger in trainer.loggers:
+        logger.log_hyperparams(hparams)
fish_speech/utils/rich_utils.py CHANGED
@@ -1,100 +1,100 @@
+from pathlib import Path
+from typing import Sequence
+
+import rich
+import rich.syntax
+import rich.tree
+from hydra.core.hydra_config import HydraConfig
+from lightning.pytorch.utilities import rank_zero_only
+from omegaconf import DictConfig, OmegaConf, open_dict
+from rich.prompt import Prompt
+
+from fish_speech.utils import logger as log
+
+
+@rank_zero_only
+def print_config_tree(
+    cfg: DictConfig,
+    print_order: Sequence[str] = (
+        "data",
+        "model",
+        "callbacks",
+        "logger",
+        "trainer",
+        "paths",
+        "extras",
+    ),
+    resolve: bool = False,
+    save_to_file: bool = False,
+) -> None:
+    """Prints content of DictConfig using Rich library and its tree structure.
+
+    Args:
+        cfg (DictConfig): Configuration composed by Hydra.
+        print_order (Sequence[str], optional): Determines in what order config components are printed.
+        resolve (bool, optional): Whether to resolve reference fields of DictConfig.
+        save_to_file (bool, optional): Whether to export config to the hydra output folder.
+    """  # noqa: E501
+
+    style = "dim"
+    tree = rich.tree.Tree("CONFIG", style=style, guide_style=style)
+
+    queue = []
+
+    # add fields from `print_order` to queue
+    for field in print_order:
+        (
+            queue.append(field)
+            if field in cfg
+            else log.warning(
+                f"Field '{field}' not found in config. "
+                + f"Skipping '{field}' config printing..."
+            )
+        )
+
+    # add all the other fields to queue (not specified in `print_order`)
+    for field in cfg:
+        if field not in queue:
+            queue.append(field)
+
+    # generate config tree from queue
+    for field in queue:
+        branch = tree.add(field, style=style, guide_style=style)
+
+        config_group = cfg[field]
+        if isinstance(config_group, DictConfig):
+            branch_content = OmegaConf.to_yaml(config_group, resolve=resolve)
+        else:
+            branch_content = str(config_group)
+
+        branch.add(rich.syntax.Syntax(branch_content, "yaml"))
+
+    # print config tree
+    rich.print(tree)
+
+    # save config tree to file
+    if save_to_file:
+        with open(Path(cfg.paths.output_dir, "config_tree.log"), "w") as file:
+            rich.print(tree, file=file)
+
+
+@rank_zero_only
+def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None:
+    """Prompts user to input tags from command line if no tags are provided in config."""  # noqa: E501
+
+    if not cfg.get("tags"):
+        if "id" in HydraConfig().cfg.hydra.job:
+            raise ValueError("Specify tags before launching a multirun!")
+
+        log.warning("No tags provided in config. Prompting user to input tags...")
+        tags = Prompt.ask("Enter a list of comma separated tags", default="dev")
+        tags = [t.strip() for t in tags.split(",") if t != ""]
+
+        with open_dict(cfg):
+            cfg.tags = tags
+
+        log.info(f"Tags: {cfg.tags}")
+
+    if save_to_file:
+        with open(Path(cfg.paths.output_dir, "tags.log"), "w") as file:
+            rich.print(cfg.tags, file=file)
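Finally, a small sketch of print_config_tree on a hand-built config; in training runs Hydra composes cfg instead, and the values below are placeholders.

from omegaconf import OmegaConf

from fish_speech.utils import print_config_tree

cfg = OmegaConf.create(
    {
        "data": {"num_workers": 4},
        "model": {"optimizer": {"lr": 1e-4}},
        "trainer": {"max_steps": 10_000},
    }
)

# Fields listed in print_order but missing from cfg only trigger warnings.
print_config_tree(cfg, resolve=True, save_to_file=False)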