Den Pavloff committed
Commit eb18e14 · 1 Parent(s): 2759e04

multispeaker, multilang

Files changed (6)
  1. app.py +60 -64
  2. create_env.py +21 -0
  3. examples.yaml +98 -0
  4. model_config.yaml +36 -0
  5. requirements.txt +2 -1
  6. util.py +178 -17
app.py CHANGED
@@ -1,65 +1,32 @@
-import os
-import subprocess
-import sys
-
-# Fix OMP_NUM_THREADS issue before any imports
-os.environ["OMP_NUM_THREADS"] = "4"
+from create_env import setup_dependencies
 
-# Install dependencies programmatically to avoid conflicts
-def setup_dependencies():
-    try:
-        # Check if already installed
-        if os.path.exists('/tmp/deps_installed'):
-            return
-
-        print("Installing transformers dev version...")
-        subprocess.check_call([
-            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
-            "git+https://github.com/huggingface/transformers.git"
-        ])
-
-        # Mark as installed
-        with open('/tmp/deps_installed', 'w') as f:
-            f.write('done')
-
-    except Exception as e:
-        print(f"Dependencies setup error: {e}")
-
-# Run setup
 setup_dependencies()
 
 import spaces
 import gradio as gr
-from util import Config, NemoAudioPlayer, KaniModel
+from util import NemoAudioPlayer, InitModels, load_config, Examples
 import numpy as np
 import torch
+import os
 
 # Get HuggingFace token
 token_ = os.getenv('HF_TOKEN')
 
-# Model configurations
-models_configs = {
-    'base': Config(),
-    'female': Config(
-        model_name='nineninesix/kani-tts-450m-0.2-ft',
-    ),
-    'male': Config(
-        model_name='nineninesix/kani-tts-450m-0.1-ft',
-    )
-}
+config = load_config("./model_config.yaml")
+models_configs = config.models
+nemo_player_cfg = config.nemo_player
+
+examples_cfg = load_config("./examples.yaml")
+examples_maker = Examples(examples_cfg)
+examples = examples_maker()
 
-# Global variables for models (loaded once)
-player = NemoAudioPlayer(Config())
-models = {}
-for model_name, config in models_configs.items():
-    print(f"Loading {model_name}...")
-    models[model_name] = KaniModel(config, player, token_)
-    print(f"{model_name} loaded!")
-print("All models loaded!")
+player = NemoAudioPlayer(nemo_player_cfg)
+init_models = InitModels(models_configs, player, token_)
+models = init_models()
 
 
 @spaces.GPU
-def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
+def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
     """
     Generate speech from text using the selected model on GPU
     """
@@ -71,16 +38,19 @@ def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
         return None, "Please select a model."
 
     try:
-        # Check GPU availability
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {device}")
 
-        # Get selected model
         selected_model = models[model_choice]
+        cfg = models_configs.get(model_choice)
+        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
+        if speaker_display and speaker_map:
+            speaker_id = speaker_map.get(speaker_display)
+        else:
+            speaker_id = None
 
-        # Generate audio
         print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
+        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
 
         sample_rate = 22050
         print("Speech generation completed!")
@@ -104,6 +74,20 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
                 label="Selected Model",
                 info="Base generates random voices"
             )
+            # Speaker selector (shown only if model has speakers)
+            # Pre-populate all available speakers for example table rendering
+            all_speakers = []
+            for _cfg in models_configs.values():
+                if _cfg and _cfg.get('speaker_id'):
+                    all_speakers.extend(list(_cfg.speaker_id.keys()))
+            all_speakers = sorted(list(set(all_speakers)))
+            speaker_dropdown = gr.Dropdown(
+                choices=all_speakers,
+                value=None,
+                label="Speaker",
+                visible=False,
+                allow_custom_value=True
+            )
 
             text_input = gr.Textbox(
                 label="Text",
@@ -146,30 +130,42 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
                 lines=3
             )
 
+    # Update speakers when model changes
+    def update_speakers(model_choice):
+        cfg = models_configs.get(model_choice)
+        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
+        if speakers:
+            return gr.update(choices=speakers, value=speakers[0], visible=True)
+        else:
+            return gr.update(choices=[], value=None, visible=False)
+
+    model_dropdown.change(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
+    # Populate speakers on initial page load based on default model
+    demo.load(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
         outputs=[audio_output, time_report_output]
     )
 
     with gr.Row():
 
-        examples = [
-            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 1.4, 0.95, 1.1, 1200],
-            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 1.4, 0.95, 1.1, 1200],
-            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 1.4, 0.95, 1.1, 1200],
-            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 1.4, 0.95, 1.1, 1200],
-            ["Holy fu- Oh my God! Don't you understand how dangerous it is?", "male", 1.4, 0.95, 1.1, 1200],
-            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 1.4, 0.95, 1.1, 1200],
-            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "base", 1.4, 0.95, 1.1, 1200],
-            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 1.4, 0.95, 1.1, 1200],
-        ]
-
+        examples = examples
 
         gr.Examples(
            examples=examples,
-            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
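
For reference, a minimal sketch (not part of the commit) of how the refactored pieces fit together outside the Gradio UI, assuming model_config.yaml and examples.yaml sit next to app.py and the chosen checkpoint is publicly accessible; passing None instead of the HF token is only a stand-in for HF_TOKEN here.

# Sketch only: drive the new multispeaker path without Gradio.
from util import NemoAudioPlayer, InitModels, load_config

config = load_config("./model_config.yaml")
player = NemoAudioPlayer(config.nemo_player)
models = InitModels(config.models, player, None)()  # None in place of the HF token

model_choice = "KaniTTS"
speaker_display = "Jenny (EN Irish)"  # display name shown in the Speaker dropdown
speaker_id = config.models[model_choice].speaker_id.get(speaker_display)  # -> "jenny"

audio, _, report = models[model_choice].run_model(
    "Hello there!", speaker_id, 1.4, 0.95, 1.1, 1200  # text, speaker, t, top_p, rp, max_tok
)
print(report)  # SPEECH TOKENS / CODEC / TOTAL timing lines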
create_env.py ADDED
@@ -0,0 +1,21 @@
+import os
+import subprocess
+import sys
+
+def setup_dependencies():
+    os.environ["OMP_NUM_THREADS"] = "4"
+    try:
+        if os.path.exists('/tmp/deps_installed'):
+            return
+
+        print("Installing transformers dev version...")
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
+            "git+https://github.com/huggingface/transformers.git"
+        ])
+
+        with open('/tmp/deps_installed', 'w') as f:
+            f.write('done')
+
+    except Exception as e:
+        print(f"Dependencies setup error: {e}")
examples.yaml ADDED
@@ -0,0 +1,98 @@
+examples:
+  - text: >-
+      Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?
+    speaker_id: "Puck (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...
+    speaker_id: "Kore (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Holy fu* Oh my God! Don't you understand how dangerous it is, huh?
+    speaker_id: "Andrew (EN)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.
+    speaker_id: "David (EN British)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.
+    speaker_id: "Jenny (EN Irish)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+
+  - text: >-
+      Der Dompfaff ist ein kleiner Fink, der im Winter oft in Gärten zu sehen ist.
+    speaker_id: "Thorsten (DE Hessisch)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      하얀 눈 위의 빨간 점 하나가 아침을 엽서처럼 만든다.
+    speaker_id: "Seulgi (KO)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      这种小雀鸟在冬季常见于树林与花园。
+    speaker_id: "Ming (ZH Shanghai OpenAI)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      طائرٌ صغير يُرى كثيرًا في حدائق الشتاء والغابات.
+    speaker_id: "Karim (AR)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Colleges of Oxford, Cambridge, Durham and the University of the Highlands and Islands UHI are 'listed bodies', as bodies that appear to the Secretary of State to be constituent colleges, schools, halls or other institutions of a university.
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
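
As a quick check (a sketch, not part of the commit), the util.Examples adapter flattens each entry above into one gr.Examples row, in the same order as the inputs list wired up in app.py:

# Sketch only: flatten examples.yaml into gr.Examples rows.
from util import load_config, Examples

rows = Examples(load_config("./examples.yaml"))()
print(rows[0])
# ["Anyway, um, so, um, tell me, tell me all about her. ...",
#  "KaniTTS", "Puck (EN Gemini)", 1.4, 0.95, 1.1, 1200]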
model_config.yaml ADDED
@@ -0,0 +1,36 @@
+nemo_player:
+  audiocodec_name: nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps
+  tokeniser_length: 64400
+  start_of_text: 1
+  end_of_text: 2
+
+models:
+
+  "KaniTTS":
+    model_name: nineninesix/kani-tts-370m
+    device_map: auto
+    speaker_id:
+      "David (EN British)": david
+      "Puck (EN Gemini)": puck
+      "Kore (EN Gemini)": kore
+      "Andrew (EN)": andrew
+      "Jenny (EN Irish)": jenny
+      "Simon (EN Unstable)": simon
+      "Katie (EN Unstable)": katie
+      "Seulgi (KO)": seulgi
+      "Bert (DE)": bert
+      "Thorsten (DE Hessisch)": thorsten
+      "Maria (ES)": maria
+      "Mei (ZH Cantonese)": mei
+      "Ming (ZH Shanghai OpenAI)": ming
+      "Karim (AR)": karim
+      "Nur (AR)": nur
+
+  "Base Model v.0.2":
+    model_name: nineninesix/kani-tts-450m-0.2-pt
+    device_map: auto
+
+  "Base Model v.0.1":
+    model_name: nineninesix/kani-tts-450m-0.1-pt
+    device_map: auto
+
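
This file is read with OmegaConf via util.load_config. A small sketch of the lookups app.py relies on (model list, display name to speaker id, and the shared nemo_player block):

# Sketch only: the lookups app.py performs against this config.
from omegaconf import OmegaConf

cfg = OmegaConf.load("model_config.yaml")
print(list(cfg.models.keys()))                           # ['KaniTTS', 'Base Model v.0.2', 'Base Model v.0.1']
print(cfg.models["KaniTTS"].speaker_id["Seulgi (KO)"])   # seulgi
print(cfg.nemo_player.audiocodec_name)                   # nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps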
requirements.txt CHANGED
@@ -2,4 +2,5 @@ torch==2.8.0
 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
-gradio>=4.0.0
+gradio>=4.0.0
+omegaconf==2.3.0
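
omegaconf==2.3.0 is the only genuinely new dependency; it backs the load_config helper in util.py that parses model_config.yaml and examples.yaml. The gradio>=4.0.0 line is removed and re-added unchanged, which most likely just reflects a missing trailing newline being fixed so the new entry could be appended.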
util.py CHANGED
@@ -3,26 +3,66 @@ import librosa
 import requests
 import time
 from nemo.collections.tts.models import AudioCodecModel
-from dataclasses import dataclass
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+from omegaconf import OmegaConf, DictConfig
 
 
-@dataclass
-class Config:
-    model_name: str = "nineninesix/kani-tts-450m-0.1-pt"
-    audiocodec_name: str = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
-    device_map: str = "auto"
-    tokeniser_length: int = 64400
-    start_of_text: int = 1
-    end_of_text: int = 2
-    max_new_tokens: int = 1200
-    temperature: float = 1.4
-    top_p: float = .95
-    repetition_penalty: float = 1.1
+def load_config(config_path: str):
+    """Load configuration from a YAML file using OmegaConf.
+
+    Args:
+        config_path (str): Path to the YAML configuration file.
+
+    Returns:
+        Any: The loaded OmegaConf DictConfig.
+    """
+    resolved_path = os.path.abspath(config_path)
+    if not os.path.exists(resolved_path):
+        raise FileNotFoundError(f"Config file not found: {resolved_path}")
+    config = OmegaConf.load(resolved_path)
+    return config
 
 
 class NemoAudioPlayer:
+
+    """
+    High-level audio reconstruction helper built on NeMo Nano Codec.
+
+    This class converts discrete codec token sequences produced by the
+    language model into time-domain audio waveforms using
+    `nemo.collections.tts.models.AudioCodecModel`. It also optionally
+    handles extraction/decoding of text spans from the generated token
+    stream when a compatible text tokenizer is provided.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Configuration block under `nemo_player` from `model_config.yaml`.
+        Expected fields:
+        - `audiocodec_name` (str): HuggingFace model id for the NeMo codec
+        - `tokeniser_length` (int): Size of the base tokenizer vocabulary
+        - `start_of_text`, `end_of_text` (int): Special text token ids
+    text_tokenizer_name : str, optional
+        HF repo id or local path of the tokenizer used by the LLM. If
+        provided, the player can also extract and decode the text segment
+        embedded in the generated ids for debugging/inspection.
+
+    Notes
+    -----
+    - The class defines a fixed layout of special token ids derived from
+      `tokeniser_length`. Audio codes are expected to be arranged in 4
+      interleaved codebooks (q=4). See `get_nano_codes` for validation.
+    - Device selection is automatic (`cuda` if available, else `cpu`).
+
+    Typical Usage
+    -------------
+    1) The model generates a sequence of token ids that contains both text
+       and audio sections delimited by special markers.
+    2) Call `get_waveform(model_output_ids)` to obtain a NumPy waveform
+       ready to be played or saved.
+    """
+
     def __init__(self, config, text_tokenizer_name: str = None) -> None:
         self.conf = config
         print(f"Loading NeMo codec model: {self.conf.audiocodec_name}")
@@ -130,6 +170,41 @@ class NemoAudioPlayer:
 
 
 class KaniModel:
+
+    """
+    Wrapper around a causal LLM that emits NeMo codec tokens for TTS.
+
+    Responsibilities
+    ----------------
+    - Load the LLM and tokenizer from HuggingFace with the provided
+      configuration (model id, device mapping, auth token, and
+      `trust_remote_code`).
+    - Prepare inputs by injecting conversation and modality control tokens
+      expected by the decoder (`START_OF_HUMAN`, `END_OF_TEXT`, etc.), and
+      optionally prefix the input with a speaker id tag.
+    - Perform generation with sampling parameters and return raw token ids.
+    - Delegate waveform reconstruction to `NemoAudioPlayer`.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Model configuration block from `models[...]` in `model_config.yaml`.
+        Expected fields:
+        - `model_name` (str): HF repo id of the LLM
+        - `device_map` (str | dict): Device mapping strategy for HF
+    player : NemoAudioPlayer
+        Audio decoder that turns generated token ids into a waveform.
+    token : str
+        HuggingFace access token (if the model requires authentication).
+
+    Key Methods
+    -----------
+    - `get_input_ids(text, speaker_id)`: builds the prompt with control
+      tokens and returns `(input_ids, attention_mask)` tensors.
+    - `model_request(...)`: runs `generate` with sampling controls.
+    - `run_model(...)`: end-to-end pipeline returning `(audio, text, report)`.
+    """
+
     def __init__(self, config, player: NemoAudioPlayer, token: str) -> None:
         self.conf = config
         self.player = player
@@ -155,14 +230,17 @@ class KaniModel:
 
         print(f"Model loaded successfully on device: {next(self.model.parameters()).device}")
 
-    def get_input_ids(self, text_prompt: str) -> tuple[torch.tensor]:
+    def get_input_ids(self, text_prompt: str, speaker_id: str) -> tuple[torch.tensor]:
         """Prepare input tokens for the model"""
         START_OF_HUMAN = self.player.start_of_human
         END_OF_TEXT = self.player.end_of_text
         END_OF_HUMAN = self.player.end_of_human
 
         # Tokenize input text
-        input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
+        if speaker_id is not None:
+            input_ids = self.tokenizer(f"{speaker_id}: {text_prompt}", return_tensors="pt").input_ids
+        else:
+            input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
 
         # Add special tokens
         start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
@@ -207,10 +285,10 @@ class KaniModel:
         report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
 
-    def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
+    def run_model(self, text: str, speaker_id: str, t: float, top_p: float, rp: float, max_tok: int):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
-        input_ids, attention_mask = self.get_input_ids(text)
+        input_ids, attention_mask = self.get_input_ids(text, speaker_id)
 
         # Generate tokens
         point_1 = time.time()
@@ -223,3 +301,86 @@ class KaniModel:
         point_3 = time.time()
         return audio, text, self.time_report(point_1, point_2, point_3)
 
+class InitModels:
+
+    """
+    Initializer that eagerly constructs a map of model name -> KaniModel.
+
+    Parameters
+    ----------
+    models_configs : OmegaConf | DictConfig
+        The `models` section from `model_config.yaml` describing one or
+        more HF LLM checkpoints and their options (device map, speakers).
+    player : NemoAudioPlayer
+        Shared audio decoder instance reused across all models.
+    token_ : str
+        HuggingFace token passed to each `KaniModel` for loading.
+
+    Returns
+    -------
+    dict
+        When called, returns a dictionary `{model_name: KaniModel}`.
+
+    Notes
+    -----
+    - All models are loaded immediately in `__call__` so the UI can list
+      them and switch between them without extra latency.
+    """
+
+    def __init__(self, models_configs: OmegaConf, player: NemoAudioPlayer, token_: str):
+        self.models_configs = models_configs
+        self.player = player
+        self.token_ = token_
+
+    def __call__(self):
+        models = {}
+        for model_name, config in self.models_configs.items():
+            print(f"Loading {model_name}...")
+            models[model_name] = KaniModel(config, self.player, self.token_)
+            print(f"{model_name} loaded!")
+        print("All models loaded!")
+        return models
+
+class Examples:
+
+    """
+    Adapter that converts YAML examples into Gradio `gr.Examples` rows.
+
+    Parameters
+    ----------
+    exam_cfg : OmegaConf | DictConfig
+        Parsed contents of `examples.yaml`. Expected structure:
+        `examples: [ {text, speaker_id?, model, temperature?, top_p?,
+        repetition_penalty?, max_len?}, ... ]`.
+
+    Behavior
+    --------
+    - Produces a list of lists whose order must match the `inputs` order
+      used when constructing `gr.Examples` in `app.py`.
+    - Current order: `[text, model_dropdown, speaker_dropdown, temp,
+      top_p, rp, max_tok]`.
+
+    Why this exists
+    ---------------
+    - Keeps format and defaults centralized, so changing the UI inputs
+      order only requires a single change here and in `app.py`.
+    """
+
+    def __init__(self, exam_cfg: OmegaConf):
+        self.exam_cfg = exam_cfg
+
+    def __call__(self) -> list[list]:
+        rows = []
+        for e in self.exam_cfg.examples:
+            text = e.get("text")
+            speaker_id = e.get("speaker_id")
+            model = e.get("model")
+            temperature = e.get("temperature", 1.4)
+            top_p = e.get("top_p", 0.95)
+            repetition_penalty = e.get("repetition_penalty", 1.1)
+            max_len = e.get("max_len", 1200)
+            # Order must match gr.Examples inputs: [text, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok]
+            rows.append([text, model, speaker_id, temperature, top_p, repetition_penalty, max_len])
+
+        return rows
+
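
One detail worth spelling out (a sketch, not part of the commit): the only thing speaker selection changes in KaniModel.get_input_ids is a plain-text name prefix on the prompt; the control tokens around it stay the same.

# Sketch only, mirroring the branch added in get_input_ids above.
speaker_id, text_prompt = "jenny", "Hello there!"
prompt = f"{speaker_id}: {text_prompt}" if speaker_id is not None else text_prompt
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# ...then START_OF_HUMAN is prepended and, presumably, END_OF_TEXT / END_OF_HUMAN
# are appended, exactly as for the base (random-voice) models with no prefix.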