Den Pavloff committed
Commit eb18e14 · 1 Parent(s): 2759e04

multispeaker, multilang

Files changed (6)
  1. app.py +60 -64
  2. create_env.py +21 -0
  3. examples.yaml +98 -0
  4. model_config.yaml +36 -0
  5. requirements.txt +2 -1
  6. util.py +178 -17
app.py CHANGED
@@ -1,65 +1,32 @@
-import os
-import subprocess
-import sys
-
-# Fix OMP_NUM_THREADS issue before any imports
-os.environ["OMP_NUM_THREADS"] = "4"
+from create_env import setup_dependencies
 
-# Install dependencies programmatically to avoid conflicts
-def setup_dependencies():
-    try:
-        # Check if already installed
-        if os.path.exists('/tmp/deps_installed'):
-            return
-
-        print("Installing transformers dev version...")
-        subprocess.check_call([
-            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
-            "git+https://github.com/huggingface/transformers.git"
-        ])
-
-        # Mark as installed
-        with open('/tmp/deps_installed', 'w') as f:
-            f.write('done')
-
-    except Exception as e:
-        print(f"Dependencies setup error: {e}")
-
-# Run setup
 setup_dependencies()
 
 import spaces
 import gradio as gr
-from util import Config, NemoAudioPlayer, KaniModel
+from util import NemoAudioPlayer, InitModels, load_config, Examples
 import numpy as np
 import torch
+import os
 
 # Get HuggingFace token
 token_ = os.getenv('HF_TOKEN')
 
-# Model configurations
-models_configs = {
-    'base': Config(),
-    'female': Config(
-        model_name='nineninesix/kani-tts-450m-0.2-ft',
-    ),
-    'male': Config(
-        model_name='nineninesix/kani-tts-450m-0.1-ft',
-    )
-}
+config = load_config("./model_config.yaml")
+models_configs = config.models
+nemo_player_cfg = config.nemo_player
+
+examples_cfg = load_config("./examples.yaml")
+examples_maker = Examples(examples_cfg)
+examples = examples_maker()
 
-# Global variables for models (loaded once)
-player = NemoAudioPlayer(Config())
-models = {}
-for model_name, config in models_configs.items():
-    print(f"Loading {model_name}...")
-    models[model_name] = KaniModel(config, player, token_)
-    print(f"{model_name} loaded!")
-print("All models loaded!")
+player = NemoAudioPlayer(nemo_player_cfg)
+init_models = InitModels(models_configs, player, token_)
+models = init_models()
 
 
 @spaces.GPU
-def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
+def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
     """
     Generate speech from text using the selected model on GPU
     """
@@ -71,16 +38,19 @@ def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
         return None, "Please select a model."
 
     try:
-        # Check GPU availability
         device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {device}")
 
-        # Get selected model
         selected_model = models[model_choice]
+        cfg = models_configs.get(model_choice)
+        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
+        if speaker_display and speaker_map:
+            speaker_id = speaker_map.get(speaker_display)
+        else:
+            speaker_id = None
 
-        # Generate audio
         print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
+        audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
 
         sample_rate = 22050
         print("Speech generation completed!")
@@ -104,6 +74,20 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
                 label="Selected Model",
                 info="Base generates random voices"
             )
+            # Speaker selector (shown only if model has speakers)
+            # Pre-populate all available speakers for example table rendering
+            all_speakers = []
+            for _cfg in models_configs.values():
+                if _cfg and _cfg.get('speaker_id'):
+                    all_speakers.extend(list(_cfg.speaker_id.keys()))
+            all_speakers = sorted(list(set(all_speakers)))
+            speaker_dropdown = gr.Dropdown(
+                choices=all_speakers,
+                value=None,
+                label="Speaker",
+                visible=False,
+                allow_custom_value=True
+            )
 
             text_input = gr.Textbox(
                 label="Text",
@@ -146,30 +130,42 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
                 lines=3
             )
 
+    # Update speakers when model changes
+    def update_speakers(model_choice):
+        cfg = models_configs.get(model_choice)
+        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
+        if speakers:
+            return gr.update(choices=speakers, value=speakers[0], visible=True)
+        else:
+            return gr.update(choices=[], value=None, visible=False)
+
+    model_dropdown.change(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
+    # Populate speakers on initial page load based on default model
+    demo.load(
+        fn=update_speakers,
+        inputs=[model_dropdown],
+        outputs=[speaker_dropdown]
+    )
+
     # GPU generation event
     generate_btn.click(
         fn=generate_speech_gpu,
-        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
         outputs=[audio_output, time_report_output]
     )
 
     with gr.Row():
 
-        examples = [
-            ["Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male", 1.4, 0.95, 1.1, 1200],
-            ["No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male", 1.4, 0.95, 1.1, 1200],
-            ["I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male", 1.4, 0.95, 1.1, 1200],
-            ["Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female", 1.4, 0.95, 1.1, 1200],
-            ["Holy fu- Oh my God! Don't you understand how dangerous it is?", "male", 1.4, 0.95, 1.1, 1200],
-            ["You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female", 1.4, 0.95, 1.1, 1200],
-            ["Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "base", 1.4, 0.95, 1.1, 1200],
-            ["Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female", 1.4, 0.95, 1.1, 1200],
-        ]
-
+        examples = examples
 
         gr.Examples(
            examples=examples,
-            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
+            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
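
For reference, a minimal sketch (not part of the commit) of how the refactored pieces fit together outside the Gradio UI, assuming model_config.yaml and examples.yaml sit next to app.py and the chosen checkpoint is publicly accessible; passing None instead of the HF token is only a stand-in for HF_TOKEN here.

# Sketch only: drive the new multispeaker path without Gradio.
from util import NemoAudioPlayer, InitModels, load_config

config = load_config("./model_config.yaml")
player = NemoAudioPlayer(config.nemo_player)
models = InitModels(config.models, player, None)()  # None in place of the HF token

model_choice = "KaniTTS"
speaker_display = "Jenny (EN Irish)"  # display name shown in the Speaker dropdown
speaker_id = config.models[model_choice].speaker_id.get(speaker_display)  # -> "jenny"

audio, _, report = models[model_choice].run_model(
    "Hello there!", speaker_id, 1.4, 0.95, 1.1, 1200  # text, speaker, t, top_p, rp, max_tok
)
print(report)  # SPEECH TOKENS / CODEC / TOTAL timing lines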
create_env.py ADDED
@@ -0,0 +1,21 @@
+import os
+import subprocess
+import sys
+
+def setup_dependencies():
+    os.environ["OMP_NUM_THREADS"] = "4"
+    try:
+        if os.path.exists('/tmp/deps_installed'):
+            return
+
+        print("Installing transformers dev version...")
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
+            "git+https://github.com/huggingface/transformers.git"
+        ])
+
+        with open('/tmp/deps_installed', 'w') as f:
+            f.write('done')
+
+    except Exception as e:
+        print(f"Dependencies setup error: {e}")
examples.yaml ADDED
@@ -0,0 +1,98 @@
+examples:
+  - text: >-
+      Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?
+    speaker_id: "Puck (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...
+    speaker_id: "Kore (EN Gemini)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Holy fu* Oh my God! Don't you understand how dangerous it is, huh?
+    speaker_id: "Andrew (EN)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.
+    speaker_id: "David (EN British)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.
+    speaker_id: "Jenny (EN Irish)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+
+  - text: >-
+      Der Dompfaff ist ein kleiner Fink, der im Winter oft in Gärten zu sehen ist.
+    speaker_id: "Thorsten (DE Hessisch)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      하얀 눈 위의 빨간 점 하나가 아침을 엽서처럼 만든다.
+    speaker_id: "Seulgi (KO)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      这种小雀鸟在冬季常见于树林与花园。
+    speaker_id: "Ming (ZH Shanghai OpenAI)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      طائرٌ صغير يُرى كثيرًا في حدائق الشتاء والغابات.
+    speaker_id: "Karim (AR)"
+    model: "KaniTTS"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
+
+  - text: >-
+      Colleges of Oxford, Cambridge, Durham and the University of the Highlands and Islands UHI are 'listed bodies', as bodies that appear to the Secretary of State to be constituent colleges, schools, halls or other institutions of a university.
+    model: "Base Model v.0.2"
+    temperature: 1.4
+    top_p: 0.95
+    repetition_penalty: 1.1
+    max_len: 1200
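
As a quick check (a sketch, not part of the commit), the util.Examples adapter flattens each entry above into one gr.Examples row, in the same order as the inputs list wired up in app.py:

# Sketch only: flatten examples.yaml into gr.Examples rows.
from util import load_config, Examples

rows = Examples(load_config("./examples.yaml"))()
print(rows[0])
# ["Anyway, um, so, um, tell me, tell me all about her. ...",
#  "KaniTTS", "Puck (EN Gemini)", 1.4, 0.95, 1.1, 1200]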
model_config.yaml ADDED
@@ -0,0 +1,36 @@
+nemo_player:
+  audiocodec_name: nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps
+  tokeniser_length: 64400
+  start_of_text: 1
+  end_of_text: 2
+
+models:
+
+  "KaniTTS":
+    model_name: nineninesix/kani-tts-370m
+    device_map: auto
+    speaker_id:
+      "David (EN British)": david
+      "Puck (EN Gemini)": puck
+      "Kore (EN Gemini)": kore
+      "Andrew (EN)": andrew
+      "Jenny (EN Irish)": jenny
+      "Simon (EN Unstable)": simon
+      "Katie (EN Unstable)": katie
+      "Seulgi (KO)": seulgi
+      "Bert (DE)": bert
+      "Thorsten (DE Hessisch)": thorsten
+      "Maria (ES)": maria
+      "Mei (ZH Cantonese)": mei
+      "Ming (ZH Shanghai OpenAI)": ming
+      "Karim (AR)": karim
+      "Nur (AR)": nur
+
+  "Base Model v.0.2":
+    model_name: nineninesix/kani-tts-450m-0.2-pt
+    device_map: auto
+
+  "Base Model v.0.1":
+    model_name: nineninesix/kani-tts-450m-0.1-pt
+    device_map: auto
+
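
This file is read with OmegaConf via util.load_config. A small sketch of the lookups app.py relies on (model list, display name to speaker id, and the shared nemo_player block):

# Sketch only: the lookups app.py performs against this config.
from omegaconf import OmegaConf

cfg = OmegaConf.load("model_config.yaml")
print(list(cfg.models.keys()))                           # ['KaniTTS', 'Base Model v.0.2', 'Base Model v.0.1']
print(cfg.models["KaniTTS"].speaker_id["Seulgi (KO)"])   # seulgi
print(cfg.nemo_player.audiocodec_name)                   # nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps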
requirements.txt CHANGED
@@ -2,4 +2,5 @@ torch==2.8.0
 librosa==0.11.0
 nemo_toolkit[tts]==2.4.0
 numpy==1.26.4
-gradio>=4.0.0
+gradio>=4.0.0
+omegaconf==2.3.0
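
omegaconf==2.3.0 is the only genuinely new dependency; it backs the load_config helper in util.py that parses model_config.yaml and examples.yaml. The gradio>=4.0.0 line is removed and re-added unchanged, which most likely just reflects a missing trailing newline being fixed so the new entry could be appended.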
util.py CHANGED
@@ -3,26 +3,66 @@ import librosa
 import requests
 import time
 from nemo.collections.tts.models import AudioCodecModel
-from dataclasses import dataclass
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+from omegaconf import OmegaConf, DictConfig
 
 
-@dataclass
-class Config:
-    model_name: str = "nineninesix/kani-tts-450m-0.1-pt"
-    audiocodec_name: str = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
-    device_map: str = "auto"
-    tokeniser_length: int = 64400
-    start_of_text: int = 1
-    end_of_text: int = 2
-    max_new_tokens: int = 1200
-    temperature: float = 1.4
-    top_p: float = .95
-    repetition_penalty: float = 1.1
+def load_config(config_path: str):
+    """Load configuration from a YAML file using OmegaConf.
+
+    Args:
+        config_path (str): Path to the YAML configuration file.
+
+    Returns:
+        Any: The loaded OmegaConf DictConfig.
+    """
+    resolved_path = os.path.abspath(config_path)
+    if not os.path.exists(resolved_path):
+        raise FileNotFoundError(f"Config file not found: {resolved_path}")
+    config = OmegaConf.load(resolved_path)
+    return config
 
 
 class NemoAudioPlayer:
+
+    """
+    High-level audio reconstruction helper built on NeMo Nano Codec.
+
+    This class converts discrete codec token sequences produced by the
+    language model into time-domain audio waveforms using
+    `nemo.collections.tts.models.AudioCodecModel`. It also optionally
+    handles extraction/decoding of text spans from the generated token
+    stream when a compatible text tokenizer is provided.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Configuration block under `nemo_player` from `model_config.yaml`.
+        Expected fields:
+        - `audiocodec_name` (str): HuggingFace model id for the NeMo codec
+        - `tokeniser_length` (int): Size of the base tokenizer vocabulary
+        - `start_of_text`, `end_of_text` (int): Special text token ids
+    text_tokenizer_name : str, optional
+        HF repo id or local path of the tokenizer used by the LLM. If
+        provided, the player can also extract and decode the text segment
+        embedded in the generated ids for debugging/inspection.
+
+    Notes
+    -----
+    - The class defines a fixed layout of special token ids derived from
+      `tokeniser_length`. Audio codes are expected to be arranged in 4
+      interleaved codebooks (q=4). See `get_nano_codes` for validation.
+    - Device selection is automatic (`cuda` if available, else `cpu`).
+
+    Typical Usage
+    -------------
+    1) The model generates a sequence of token ids that contains both text
+       and audio sections delimited by special markers.
+    2) Call `get_waveform(model_output_ids)` to obtain a NumPy waveform
+       ready to be played or saved.
+    """
+
     def __init__(self, config, text_tokenizer_name: str = None) -> None:
         self.conf = config
         print(f"Loading NeMo codec model: {self.conf.audiocodec_name}")
@@ -130,6 +170,41 @@ class NemoAudioPlayer:
 
 
 class KaniModel:
+
+    """
+    Wrapper around a causal LLM that emits NeMo codec tokens for TTS.
+
+    Responsibilities
+    ----------------
+    - Load the LLM and tokenizer from HuggingFace with the provided
+      configuration (model id, device mapping, auth token, and
+      `trust_remote_code`).
+    - Prepare inputs by injecting conversation and modality control tokens
+      expected by the decoder (`START_OF_HUMAN`, `END_OF_TEXT`, etc.), and
+      optionally prefix the input with a speaker id tag.
+    - Perform generation with sampling parameters and return raw token ids.
+    - Delegate waveform reconstruction to `NemoAudioPlayer`.
+
+    Parameters
+    ----------
+    config : OmegaConf | DictConfig
+        Model configuration block from `models[...]` in `model_config.yaml`.
+        Expected fields:
+        - `model_name` (str): HF repo id of the LLM
+        - `device_map` (str | dict): Device mapping strategy for HF
+    player : NemoAudioPlayer
+        Audio decoder that turns generated token ids into a waveform.
+    token : str
+        HuggingFace access token (if the model requires authentication).
+
+    Key Methods
+    -----------
+    - `get_input_ids(text, speaker_id)`: builds the prompt with control
+      tokens and returns `(input_ids, attention_mask)` tensors.
+    - `model_request(...)`: runs `generate` with sampling controls.
+    - `run_model(...)`: end-to-end pipeline returning `(audio, text, report)`.
+    """
+
     def __init__(self, config, player: NemoAudioPlayer, token: str) -> None:
         self.conf = config
         self.player = player
@@ -155,14 +230,17 @@ class KaniModel:
 
         print(f"Model loaded successfully on device: {next(self.model.parameters()).device}")
 
-    def get_input_ids(self, text_prompt: str) -> tuple[torch.tensor]:
+    def get_input_ids(self, text_prompt: str, speaker_id: str) -> tuple[torch.tensor]:
         """Prepare input tokens for the model"""
         START_OF_HUMAN = self.player.start_of_human
         END_OF_TEXT = self.player.end_of_text
         END_OF_HUMAN = self.player.end_of_human
 
         # Tokenize input text
-        input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
+        if speaker_id is not None:
+            input_ids = self.tokenizer(f"{speaker_id}: {text_prompt}", return_tensors="pt").input_ids
+        else:
+            input_ids = self.tokenizer(text_prompt, return_tensors="pt").input_ids
 
         # Add special tokens
         start_token = torch.tensor([[START_OF_HUMAN]], dtype=torch.int64)
@@ -207,10 +285,10 @@ class KaniModel:
         report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
 
-    def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
+    def run_model(self, text: str, speaker_id: str, t: float, top_p: float, rp: float, max_tok: int):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
-        input_ids, attention_mask = self.get_input_ids(text)
+        input_ids, attention_mask = self.get_input_ids(text, speaker_id)
 
         # Generate tokens
         point_1 = time.time()
@@ -223,3 +301,86 @@ class KaniModel:
         point_3 = time.time()
         return audio, text, self.time_report(point_1, point_2, point_3)
 
+class InitModels:
+
+    """
+    Initializer that eagerly constructs a map of model name -> KaniModel.
+
+    Parameters
+    ----------
+    models_configs : OmegaConf | DictConfig
+        The `models` section from `model_config.yaml` describing one or
+        more HF LLM checkpoints and their options (device map, speakers).
+    player : NemoAudioPlayer
+        Shared audio decoder instance reused across all models.
+    token_ : str
+        HuggingFace token passed to each `KaniModel` for loading.
+
+    Returns
+    -------
+    dict
+        When called, returns a dictionary `{model_name: KaniModel}`.
+
+    Notes
+    -----
+    - All models are loaded immediately in `__call__` so the UI can list
+      them and switch between them without extra latency.
+    """
+
+    def __init__(self, models_configs: OmegaConf, player: NemoAudioPlayer, token_: str):
+        self.models_configs = models_configs
+        self.player = player
+        self.token_ = token_
+
+    def __call__(self):
+        models = {}
+        for model_name, config in self.models_configs.items():
+            print(f"Loading {model_name}...")
+            models[model_name] = KaniModel(config, self.player, self.token_)
+            print(f"{model_name} loaded!")
+        print("All models loaded!")
+        return models
+
+class Examples:
+
+    """
+    Adapter that converts YAML examples into Gradio `gr.Examples` rows.
+
+    Parameters
+    ----------
+    exam_cfg : OmegaConf | DictConfig
+        Parsed contents of `examples.yaml`. Expected structure:
+        `examples: [ {text, speaker_id?, model, temperature?, top_p?,
+        repetition_penalty?, max_len?}, ... ]`.
+
+    Behavior
+    --------
+    - Produces a list of lists whose order must match the `inputs` order
+      used when constructing `gr.Examples` in `app.py`.
+    - Current order: `[text, model_dropdown, speaker_dropdown, temp,
+      top_p, rp, max_tok]`.
+
+    Why this exists
+    ---------------
+    - Keeps format and defaults centralized, so changing the UI inputs
+      order only requires a single change here and in `app.py`.
+    """
+
+    def __init__(self, exam_cfg: OmegaConf):
+        self.exam_cfg = exam_cfg
+
+    def __call__(self) -> list[list]:
+        rows = []
+        for e in self.exam_cfg.examples:
+            text = e.get("text")
+            speaker_id = e.get("speaker_id")
+            model = e.get("model")
+            temperature = e.get("temperature", 1.4)
+            top_p = e.get("top_p", 0.95)
+            repetition_penalty = e.get("repetition_penalty", 1.1)
+            max_len = e.get("max_len", 1200)
+            # Order must match gr.Examples inputs: [text, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok]
+            rows.append([text, model, speaker_id, temperature, top_p, repetition_penalty, max_len])
+
+        return rows
+
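
One detail worth spelling out (a sketch, not part of the commit): the only thing speaker selection changes in KaniModel.get_input_ids is a plain-text name prefix on the prompt; the control tokens around it stay the same.

# Sketch only, mirroring the branch added in get_input_ids above.
speaker_id, text_prompt = "jenny", "Hello there!"
prompt = f"{speaker_id}: {text_prompt}" if speaker_id is not None else text_prompt
# input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# ...then START_OF_HUMAN is prepended and, presumably, END_OF_TEXT / END_OF_HUMAN
# are appended, exactly as for the base (random-voice) models with no prefix.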