Spaces:
Running
on
Zero
Running
on
Zero
| # The code in this file is almost entirely written by LLMs and is much, much, much messier than it needs to be (at this point it's not clear to what extent it is even human-modifiable). We'd hope to improve this for any future local gradio release(s). | |
| import tempfile | |
| import os | |
| import json | |
| import time | |
| import secrets | |
| import logging | |
| from pathlib import Path | |
| from typing import Tuple, Any | |
| from functools import partial | |
| os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1' | |
| logging.getLogger("huggingface_hub").setLevel(logging.ERROR) | |
| import warnings | |
| # Suppress torchaudio TorchCodec parameter warnings | |
| warnings.filterwarnings('ignore', message='.*encoding.*parameter is not fully supported by TorchCodec') | |
| warnings.filterwarnings('ignore', message='.*bits_per_sample.*parameter is not directly supported by TorchCodec') | |
| warnings.filterwarnings('ignore', message='.* is not used by TorchCodec AudioEncoder. Format is determined by the file extension.') | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import torchaudio | |
| from huggingface_hub import snapshot_download | |
| import spaces | |
| from inference import ( | |
| load_model_from_hf, | |
| load_fish_ae_from_hf, | |
| load_pca_state_from_hf, | |
| load_audio, | |
| ae_reconstruct, | |
| sample_pipeline | |
| ) | |
| from samplers import sample_euler_cfg_any, GuidanceMode | |
| import tarfile | |
# --------------------------------------------------------------------
### Configuration
# Dtype passed to load_model_from_hf() for the main model.
MODEL_DTYPE = torch.bfloat16
# Dtype passed to load_fish_ae_from_hf() for the Fish autoencoder.
# torch.bfloat16 is an alternative, but quality may be slightly worse;
# prefer float32 when there is enough memory.
FISH_AE_DTYPE = torch.float32
USE_16_BIT_WAV = True  # Save WAV files as 16-bit PCM instead of 32-bit float
# Audio Prompt Library for Custom Audio Panel (included in repo)
AUDIO_PROMPT_FOLDER = Path("./prompt_audio")
# If not on Zero GPU, compile fish_ae encoder/decoder on initialization
COMPILE_FISH_IF_NOT_ON_ZERO_GPU = True
# Silentcipher watermarking configuration
USE_SILENTCIPHER = True  # Enable/disable audio watermarking
SILENTCIPHER_MESSAGE = [91, 57, 81, 60, 83]  # Watermark message (list of integers)
SILENTCIPHER_SDR = 47  # Message SDR in dB (higher = less perceptible but less robust)
# Get HF token from environment for private model access
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# --------------------------------------------------------------------
# Check if running on Zero GPU (compile incompatible with Zero GPU).
# The flag is True whenever the SPACES_ZERO_GPU environment variable is set,
# regardless of its value.
IS_ZEROGPU = os.environ.get("SPACES_ZERO_GPU") is not None
def _safe_members(tf, prefix):
    """Yield the members of an open tar archive that are safe to extract.

    A member is yielded only when all of the following hold:

    * its name starts with ``prefix`` (a trailing ``/`` is appended to
      ``prefix`` when missing, so ``"EARS"`` does not match ``"EARS2/..."``);
    * its name is not absolute and contains no ``..`` component;
    * if it is a symbolic or hard link, its link target is not absolute and
      contains no ``..`` component, so extraction cannot be redirected outside
      the destination directory through a link.

    Args:
        tf: An object exposing ``getmembers()`` that returns
            ``tarfile.TarInfo``-like entries (normally an open ``TarFile``).
        prefix: Top-level directory inside the archive to extract.

    Yields:
        The members that passed every check, in archive order.
    """
    if not prefix.endswith('/'):
        prefix += '/'
    for member in tf.getmembers():
        if not member.name.startswith(prefix):
            continue
        member_path = Path(member.name)
        if member_path.is_absolute() or '..' in member_path.parts:
            continue
        # Link members carry a second path (the target) that needs the same
        # traversal checks as the member name itself.
        if member.issym() or member.islnk():
            target = Path(member.linkname)
            if target.is_absolute() or '..' in target.parts:
                continue
        yield member
def ensure_tar_tree(repo_id: str, root: str, *, token: str | None = None, max_workers: int = 4):
    """Download a dataset snapshot and make sure ``<root>/`` is unpacked in it.

    The snapshot is restricted to ``<root>.tar`` plus a few small metadata
    files. If the ``<root>`` directory already exists inside the snapshot it is
    returned as-is; otherwise ``<root>.tar`` is extracted next to it, keeping
    only the members accepted by ``_safe_members``.

    Args:
        repo_id: Hugging Face dataset repository id.
        root: Name of both the tarball stem and the directory it contains.
        token: Optional Hugging Face access token.
        max_workers: Passed through to ``snapshot_download``.

    Returns:
        Path to the ``<root>`` directory inside the local snapshot.

    Raises:
        FileNotFoundError: If the directory is absent and the snapshot does
            not contain ``<root>.tar`` either.
    """
    os.environ.setdefault('HF_HUB_ENABLE_HF_TRANSFER', '1')
    from huggingface_hub import snapshot_download

    wanted_files = [f'{root}.tar', 'index.jsonl', 'README.md', 'LICENSE']
    snapshot_dir = Path(
        snapshot_download(
            repo_id=repo_id,
            repo_type='dataset',
            allow_patterns=wanted_files,
            resume_download=True,
            token=token,
            max_workers=max_workers,
        )
    )

    extracted_dir = snapshot_dir / root
    if not extracted_dir.exists():
        archive = snapshot_dir / f'{root}.tar'
        if not archive.exists():
            raise FileNotFoundError(f'Expected {archive} in snapshot')
        with tarfile.open(archive, 'r') as tar:
            tar.extractall(snapshot_dir, members=_safe_members(tar, root))
    return extracted_dir
# Speaker-embedding datasets. Each call downloads the snapshot if needed and
# unpacks its tarball on first use (see ensure_tar_tree).
EARS_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-ears-tar", root="EARS", token=HF_TOKEN)
VCTK_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-vctk-tar", root="VCTK", token=HF_TOKEN)
EXPRESSO_PATH = ensure_tar_tree(repo_id="jordand/echo-embeddings-expresso-tar", root="Expresso", token=HF_TOKEN)
from huggingface_hub import snapshot_download
# The custom voices are stored as loose files, not a tarball, so only the
# per-voice latent, metadata and audio files are fetched.
HF_CUSTOM_PATH = Path(snapshot_download(
    repo_id="jordand/echo-embeddings-custom",
    repo_type="dataset",
    allow_patterns=[
        "HF-Custom/**/speaker_latent.safetensors",
        "HF-Custom/**/metadata.json",
        "HF-Custom/**/audio.mp3",
    ],
    token=HF_TOKEN,
)) / "HF-Custom"
# Scratch directory for audio files handed to the Gradio UI; created at import.
TEMP_AUDIO_DIR = Path('./temp_gradio_audio')
TEMP_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
| # Helper functions for unique filenames and cleanup | |
| def make_stem(prefix: str, user_id: str | None = None) -> str: | |
| """Create unique filename stem: prefix__user__timestamp_random or prefix__timestamp_random if no user_id.""" | |
| ts = int(time.time() * 1000) | |
| rand = secrets.token_hex(4) | |
| if user_id: | |
| return f"{prefix}__{user_id}__{ts}_{rand}" | |
| return f"{prefix}__{ts}_{rand}" | |
| def cleanup_temp_audio(dir_: Path, user_id: str | None, max_age_sec: int = 60 * 5): | |
| """Remove old files globally and all previous files for this user.""" | |
| now = time.time() | |
| # 1) Global TTL: remove any file older than max_age_sec | |
| for p in dir_.glob("*"): | |
| try: | |
| if p.is_file() and (now - p.stat().st_mtime) > max_age_sec: | |
| p.unlink(missing_ok=True) | |
| except Exception: | |
| pass | |
| # 2) Per-user: remove ALL previous files for this user (we don't need to keep any) | |
| if user_id: | |
| for p in dir_.glob(f"*__{user_id}__*"): | |
| try: | |
| if p.is_file(): | |
| p.unlink(missing_ok=True) | |
| except Exception: | |
| pass | |
# Files read by the UI helpers; both are resolved relative to the working directory.
TEXT_PRESETS_PATH = Path('./text_presets.txt')
SAMPLER_PRESETS_PATH = Path('./sampler_presets.json')
# Global model variables (loaded lazily for Zero GPU).
# All of them stay None until load_models() / do_compile() populate them.
model = None
model_compiled = None  # Separate compiled model for toggling (set by do_compile)
fish_ae = None
pca_state = None
silentcipher_model = None  # Silentcipher watermarking model
# Flag set to True by do_compile() once compilation has succeeded.
# Not to be confused with `model_compiled`, which holds the compiled model itself.
_model_compiled = False
def load_models():
    """Lazy load models on first use (required for Zero GPU).

    ``model is None`` is the "nothing loaded yet" sentinel. The main model, the
    Fish autoencoder and the PCA state are loaded into locals first and only
    published to the module globals once all three have loaded. If any loader
    raises, no global is changed and the next call retries from scratch,
    instead of leaving ``model`` set while ``fish_ae`` / ``pca_state`` are
    still ``None``.

    When ``USE_SILENTCIPHER`` is enabled the watermarking model is loaded as
    well; a failure there is reported on stdout and watermarking is skipped.
    """
    global model, model_compiled, fish_ae, pca_state, silentcipher_model
    if model is not None:
        return

    loaded_model = load_model_from_hf(dtype=MODEL_DTYPE, compile=False, token=HF_TOKEN)
    loaded_fish_ae = load_fish_ae_from_hf(
        compile=(COMPILE_FISH_IF_NOT_ON_ZERO_GPU and not IS_ZEROGPU),
        dtype=FISH_AE_DTYPE,
        token=HF_TOKEN,
    )
    loaded_pca_state = load_pca_state_from_hf(token=HF_TOKEN)

    # Publish together; `model` goes last because it is the loaded-sentinel.
    fish_ae = loaded_fish_ae
    pca_state = loaded_pca_state
    model = loaded_model

    # Load silentcipher model if enabled
    if USE_SILENTCIPHER:
        try:
            import silentcipher
            silentcipher_model = silentcipher.get_model(model_type='44.1k', device='cuda')
        except Exception as e:
            # Watermarking is optional: report and carry on without it.
            print(f"Warning: Failed to load silentcipher model: {e}")
            print("Continuing without watermarking...")
def compile_model(should_compile):
    """Return UI updates for the compile checkbox and its status text.

    This only decides what the UI should show; it does not compile anything.

    Returns:
        A pair ``(checkbox_update, status_update)``:

        * on Zero GPU the checkbox is unchecked and disabled, with a warning;
        * when unchecked, the checkbox stays enabled and the status is hidden;
        * when already compiled, the checkbox stays checked with a note;
        * otherwise the checkbox is checked but disabled and a "compiling"
          message is shown.
    """
    if IS_ZEROGPU:
        # Compilation is not supported on Zero GPU.
        checked, enabled, status = False, False, "⚠️ Compile disabled on Zero GPU"
    elif not should_compile:
        # User unchecked the box: clear the status and allow toggling.
        checked, enabled, status = False, True, ""
    elif _model_compiled:
        checked, enabled, status = True, True, "✓ Model already compiled"
    else:
        # Compilation is needed: lock the checkbox while it runs.
        checked, enabled, status = True, False, "⏳ Compiling... (1-3 minutes)"

    # The status text is visible exactly when there is something to show.
    return (
        gr.update(value=checked, interactive=enabled),
        gr.update(value=status, visible=bool(status)),
    )
def do_compile():
    """Build the separate compiled model, if needed, and report the outcome.

    The uncompiled ``model`` is left as it is; ``model_compiled`` receives
    ``torch.compile(model)`` and two of its methods are replaced with compiled
    versions of the corresponding ``model`` methods.

    Returns:
        A pair ``(status_update, checkbox_update)``. The status is hidden on
        success or when the model was already compiled, and shows a message on
        Zero GPU or on failure.
    """
    global model, model_compiled, _model_compiled

    # Compilation is unavailable on Zero GPU.
    if IS_ZEROGPU:
        return (
            gr.update(value="⚠️ Compile disabled on Zero GPU", visible=True),
            gr.update(interactive=False),
        )

    if not _model_compiled:
        try:
            # The models must exist before they can be compiled. Zero GPU was
            # ruled out above, so loading eagerly here is fine.
            load_models()
            model_compiled = torch.compile(model)
            for method_name in ("get_kv_cache", "get_kv_cache_from_precomputed_speaker_state"):
                setattr(model_compiled, method_name, torch.compile(getattr(model, method_name)))
            _model_compiled = True
        except Exception as exc:
            print(f"Compilation failed: {exc}")
            return (
                gr.update(value=f"✗ Compilation failed: {exc}", visible=True),
                gr.update(interactive=True),
            )

    return gr.update(value="", visible=False), gr.update(interactive=True)
def save_audio_with_format(audio_tensor: torch.Tensor, base_path: Path, filename: str, sample_rate: int, audio_format: str) -> Path:
    """Write ``audio_tensor`` to ``base_path/filename.<ext>`` and return the path.

    When ``audio_format`` is ``"mp3"`` an MP3 is attempted first; if that
    raises, the error is printed and a WAV file is written instead. Any other
    ``audio_format`` goes straight to WAV. WAV files are 16-bit PCM when
    ``USE_16_BIT_WAV`` is set, otherwise torchaudio's defaults are used.
    """
    if audio_format == "mp3":
        try:
            mp3_path = base_path / f"{filename}.mp3"
            torchaudio.save(
                str(mp3_path),
                audio_tensor,
                sample_rate,
                format="mp3",
                encoding="mp3",
                bits_per_sample=None,
            )
        except Exception as e:
            print(f"MP3 encoding failed: {e}, falling back to WAV")
        else:
            return mp3_path

    # WAV: the requested format, or the fallback after a failed MP3 attempt.
    wav_path = base_path / f"{filename}.wav"
    wav_options = {"encoding": "PCM_S", "bits_per_sample": 16} if USE_16_BIT_WAV else {}
    torchaudio.save(str(wav_path), audio_tensor, sample_rate, **wav_options)
    return wav_path
def _parse_optional_number(raw, cast):
    """Convert a UI field with ``cast``; ``None`` and blank strings mean "not set" (``None``)."""
    if raw is None:
        return None
    if isinstance(raw, str):
        raw = raw.strip()
        if not raw:
            return None
    return cast(raw)


def generate_audio(
    text_prompt: str,
    speaker_st_path: str,
    speaker_audio_path: str,
    # Sampling parameters
    num_steps: int,
    rng_seed: int,
    cfg_mode: str,
    cfg_scale_text: float,
    cfg_scale_speaker: float,
    cfg_min_t: float,
    cfg_max_t: float,
    truncation_factor: float,
    rescale_k: float,
    rescale_sigma: float,
    speaker_k_enable: bool,
    speaker_k_scale: float,
    speaker_k_min_t: float,
    speaker_k_max_layers: int,
    apg_eta_text: float,
    apg_eta_speaker: float,
    apg_momentum_text: float,
    apg_momentum_speaker: float,
    apg_norm_text: str,
    apg_norm_speaker: str,
    reconstruct_first_30_seconds: bool,
    use_custom_shapes: bool,
    max_text_byte_length: str,
    max_speaker_latent_length: str,
    sample_latent_len: str,
    audio_format: str,
    use_compile: bool,
    show_original_audio: bool,
    session_id: str,
) -> Tuple[Any, Any, Any, Any, Any, Any, Any, Any, Any]:
    """Generate speech for ``text_prompt`` and return the Gradio updates for the result panel.

    Steps, in order: load the models on first use, pick the compiled or
    uncompiled model, clean up old temp files, parse the sampler settings, run
    ``sample_pipeline``, optionally watermark the result, save it, and
    optionally save a reconstruction and/or a copy of the speaker audio.

    Notes on the arguments:
        speaker_st_path: Accepted for interface compatibility; not used here.
        speaker_audio_path: Empty/``None`` means no speaker audio is passed to
            the pipeline, and no reference audio is produced.
        num_steps: Clamped to ``[1, 80]``.
        rescale_k: The value ``1.0`` means "off" and is passed on as ``None``.
        cfg_mode: ``"independent"``, ``"alternating"`` or ``"apg-independent"``;
            any other value selects joint guidance, for which the speaker scale
            is forced to ``None``. The ``apg_*`` values are only used in
            ``"apg-independent"`` mode.
        speaker_k_*: Only used when ``speaker_k_enable`` is true.
        max_text_byte_length, max_speaker_latent_length, sample_latent_len:
            Only used when ``use_custom_shapes`` is true; blank values map to
            ``None``, ``None`` and ``640`` respectively. Otherwise the defaults
            ``768``, ``2560`` and ``640`` apply.
        session_id: Embedded in the output filenames; earlier files of the
            same session are deleted before generating.

    Returns:
        Nine ``gr.update`` objects: an update with no changes, the generated
        audio, the text-prompt display, the original audio, the timing text,
        the reconstructed audio, and three visibility updates (original
        accordion, reference accordion, reference header).
    """
    # Load models on first use (required for Zero GPU)
    load_models()

    # Choose which model to use based on the compile setting.
    compiled_ready = model_compiled is not None
    active_model = model_compiled if (use_compile and compiled_ready) else model
    if use_compile and not compiled_ready:
        print("Warning: Compile requested but model not yet compiled. Using uncompiled model.")

    # Cleanup old temp files globally and remove ALL previous files for this user
    cleanup_temp_audio(TEMP_AUDIO_DIR, session_id)

    # The speaker is optional; an empty path means "no speaker conditioning audio".
    if not speaker_audio_path:
        speaker_audio_path = None
    has_speaker = speaker_audio_path is not None

    start_time = time.time()

    # Parse parameters (most are already numeric from gr.Number)
    num_steps_int = min(max(int(num_steps), 1), 80)  # Clamp to [1, 80]
    rng_seed_int = int(rng_seed) if rng_seed is not None else 0
    cfg_scale_text_val = float(cfg_scale_text)
    cfg_min_t_val = float(cfg_min_t)
    cfg_max_t_val = float(cfg_max_t)
    truncation_factor_val = float(truncation_factor)
    rescale_k_val = float(rescale_k) if rescale_k != 1.0 else None  # 1.0 means "off"
    rescale_sigma_val = float(rescale_sigma)

    # Determine guidance mode. Modes listed here take a separate speaker scale;
    # anything else falls back to joint guidance, where it must be None.
    speaker_guided_modes = {
        "independent": GuidanceMode.INDEPENDENT,
        "alternating": GuidanceMode.ALTERNATING,
        "apg-independent": GuidanceMode.APG,
    }
    if cfg_mode in speaker_guided_modes:
        guidance_mode = speaker_guided_modes[cfg_mode]
        cfg_scale_speaker_val = _parse_optional_number(cfg_scale_speaker, float)
    else:  # "joint-unconditional"
        guidance_mode = GuidanceMode.JOINT
        cfg_scale_speaker_val = None

    # APG settings are only forwarded in APG mode; every other mode passes None.
    apg_eta_text_val = None
    apg_eta_speaker_val = None
    apg_momentum_text_val = None
    apg_momentum_speaker_val = None
    apg_norm_text_val = None
    apg_norm_speaker_val = None
    if cfg_mode == "apg-independent":
        apg_eta_text_val = _parse_optional_number(apg_eta_text, float)
        apg_eta_speaker_val = _parse_optional_number(apg_eta_speaker, float)
        apg_momentum_text_val = _parse_optional_number(apg_momentum_text, float)
        apg_momentum_speaker_val = _parse_optional_number(apg_momentum_speaker, float)
        apg_norm_text_val = _parse_optional_number(apg_norm_text, float)
        apg_norm_speaker_val = _parse_optional_number(apg_norm_speaker, float)

    # Parse speaker K scale parameters (available for all modes)
    speaker_k_scale_val = None
    speaker_k_min_t_val = None
    speaker_k_max_layers_val = None
    if speaker_k_enable:
        speaker_k_scale_val = _parse_optional_number(speaker_k_scale, float)
        speaker_k_min_t_val = _parse_optional_number(speaker_k_min_t, float)
        speaker_k_max_layers_val = _parse_optional_number(speaker_k_max_layers, int)

    # Parse custom shapes if enabled
    if use_custom_shapes:
        # Blank values are allowed for the first two fields (passed on as None).
        pad_to_max_text_seq_len = _parse_optional_number(max_text_byte_length, int)
        pad_to_max_speaker_latent_len = _parse_optional_number(max_speaker_latent_length, int)
        sample_latent_len_val = _parse_optional_number(sample_latent_len, int)
        if sample_latent_len_val is None:
            sample_latent_len_val = 640
    else:
        pad_to_max_text_seq_len = 768
        pad_to_max_speaker_latent_len = 2560
        sample_latent_len_val = 640

    # Create sample function with parameters
    sample_fn = partial(
        sample_euler_cfg_any,
        num_steps=num_steps_int,
        guidance_mode=guidance_mode,
        cfg_scale_text=cfg_scale_text_val,
        cfg_scale_speaker=cfg_scale_speaker_val,
        cfg_min_t=cfg_min_t_val,
        cfg_max_t=cfg_max_t_val,
        truncation_factor=truncation_factor_val,
        rescale_k=rescale_k_val,
        rescale_sigma=rescale_sigma_val,
        speaker_k_scale=speaker_k_scale_val,
        speaker_k_min_t=speaker_k_min_t_val,
        speaker_k_max_layers=speaker_k_max_layers_val,
        apg_eta_text=apg_eta_text_val,
        apg_eta_speaker=apg_eta_speaker_val,
        apg_momentum_text=apg_momentum_text_val,
        apg_momentum_speaker=apg_momentum_speaker_val,
        apg_norm_text=apg_norm_text_val,
        apg_norm_speaker=apg_norm_speaker_val,
        block_size=sample_latent_len_val,
    )

    # Load speaker audio if provided
    speaker_audio = load_audio(speaker_audio_path).cuda() if has_speaker else None

    # Generate audio using raw audio (with selected model - compiled or not)
    audio_out = sample_pipeline(
        model=active_model,
        fish_ae=fish_ae,
        pca_state=pca_state,
        sample_fn=sample_fn,
        text_prompt=text_prompt,
        speaker_audio=speaker_audio,
        rng_seed=rng_seed_int,
        pad_to_max_text_seq_len=pad_to_max_text_seq_len,
        pad_to_max_speaker_latent_len=pad_to_max_speaker_latent_len,
    )

    # Apply silentcipher watermarking if enabled; on failure keep the clean audio.
    audio_to_save = audio_out[0].cpu()
    if USE_SILENTCIPHER and silentcipher_model is not None:
        try:
            audio_numpy = audio_to_save.squeeze(0).numpy()
            encoded_audio, sdr = silentcipher_model.encode_wav(
                audio_numpy,
                44100,
                SILENTCIPHER_MESSAGE,
                message_sdr=SILENTCIPHER_SDR,
            )
            audio_to_save = torch.tensor(encoded_audio).unsqueeze(0)
        except Exception as e:
            print(f"Warning: Watermarking failed: {e}")
            print("Saving audio without watermark...")

    # Save generated audio with format selection (unique filename per session)
    output_path = save_audio_with_format(
        audio_to_save,
        TEMP_AUDIO_DIR,
        make_stem("generated", session_id),
        44100,
        audio_format,
    )

    # Calculate generation time (reference-audio work below is not included)
    generation_time = time.time() - start_time
    time_str = f"⏱️ Total generation time: {generation_time:.2f}s"

    # Format text prompt for display
    text_display = f"**Text Prompt:**\n\n{text_prompt}"

    # Prepare reconstruction and original audio based on checkboxes
    recon_output_path = None
    original_output_path = None

    # Optionally reconstruct the start of the speaker audio for reference.
    if reconstruct_first_30_seconds and has_speaker:
        # The autoencoder input is cropped/zero-padded to exactly 2048 * 640
        # samples (about 29.7 s at the 44100 Hz rate used when saving), and the
        # output is trimmed back to the speaker audio length.
        window = 2048 * 640
        padded_input = torch.nn.functional.pad(
            speaker_audio[..., :window],
            (0, max(0, window - speaker_audio.shape[-1])),
        )[None]
        audio_recon = ae_reconstruct(
            fish_ae=fish_ae,
            pca_state=pca_state,
            audio=padded_input,
        )[..., :speaker_audio.shape[-1]]
        recon_output_path = save_audio_with_format(
            audio_recon.cpu()[0],
            TEMP_AUDIO_DIR,
            make_stem("speaker_recon", session_id),
            44100,
            audio_format,
        )

    # Optionally expose the speaker audio exactly as load_audio() returned it.
    if show_original_audio and has_speaker:
        original_output_path = save_audio_with_format(
            speaker_audio.cpu(),
            TEMP_AUDIO_DIR,
            make_stem("original_audio", session_id),
            44100,
            audio_format,
        )

    # Return results with visibility control for accordions
    show_reference_section = (show_original_audio or reconstruct_first_30_seconds) and has_speaker
    return (
        gr.update(),
        gr.update(value=str(output_path), visible=True),
        gr.update(value=text_display, visible=True),
        gr.update(value=str(original_output_path) if original_output_path else None, visible=True),
        gr.update(value=time_str, visible=True),
        gr.update(value=str(recon_output_path) if recon_output_path else None, visible=True),
        gr.update(visible=(show_original_audio and has_speaker)),  # original_accordion visibility
        gr.update(visible=(reconstruct_first_30_seconds and has_speaker)),  # reference_accordion visibility
        gr.update(visible=show_reference_section),  # reference_audio_header visibility
    )
| # UI Helper Functions | |
def load_speaker_metadata(speaker_id):
    """Load metadata for a speaker from any of their voice folders.

    Folders under ``EARS_PATH`` whose name starts with ``<speaker_id>_`` are
    visited in sorted order, so the result does not depend on directory
    listing order. The first readable ``metadata.json`` that holds a JSON
    object wins.

    Returns:
        The value stored under ``"speaker_metadata"`` (``{}`` when that key is
        missing), or ``None`` when ``EARS_PATH`` does not exist or no usable
        metadata file is found.
    """
    if not EARS_PATH.exists():
        return None
    folder_prefix = f"{speaker_id}_"
    for subdir in sorted(EARS_PATH.iterdir()):
        if not (subdir.is_dir() and subdir.name.startswith(folder_prefix)):
            continue
        metadata_path = subdir / "metadata.json"
        if not metadata_path.exists():
            continue
        try:
            # JSON files are UTF-8; do not rely on the locale's default encoding.
            with open(metadata_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed file (JSON and Unicode decode errors are
            # ValueError subclasses): try the speaker's next folder.
            continue
        if isinstance(data, dict):
            return data.get("speaker_metadata", {})
    return None
def get_speakers():
    """Get list of unique speakers with their metadata.

    Speaker ids are the part before the first ``_`` of every directory under
    ``EARS_PATH`` whose name starts with ``p`` and contains ``_``. Each
    returned dict has the keys ``id``, ``gender``, ``age``, ``ethnicity`` and
    ``native_language``; fields missing from the metadata are ``'unknown'``.
    The list is sorted by speaker id and is empty when ``EARS_PATH`` is absent.
    """
    if not EARS_PATH.exists():
        return []

    speaker_ids = {
        entry.name.split('_')[0]
        for entry in EARS_PATH.iterdir()
        if entry.is_dir() and entry.name.startswith('p') and '_' in entry.name
    }

    # (output key, key in the metadata file). The metadata file spells
    # "native language" with a space.
    field_sources = (
        ('gender', 'gender'),
        ('age', 'age'),
        ('ethnicity', 'ethnicity'),
        ('native_language', 'native language'),
    )

    roster = []
    for speaker_id in sorted(speaker_ids):
        # Missing or empty metadata yields 'unknown' for every field.
        metadata = load_speaker_metadata(speaker_id) or {}
        record = {'id': speaker_id}
        for output_key, metadata_key in field_sources:
            record[output_key] = metadata.get(metadata_key, 'unknown')
        roster.append(record)
    return roster
def get_speakers_table(search_query=""):
    """Get speakers as table data for Gradio, optionally filtered by search query.

    Each row is ``[id, gender, age, ethnicity, native_language]`` with gender
    abbreviated to ``M``/``F`` (or the upper-cased first letter, ``?`` when
    empty). A non-empty ``search_query`` keeps only rows whose space-joined,
    lower-cased fields contain the lower-cased query.
    """
    rows = []
    for speaker in get_speakers():
        raw_gender = speaker['gender']
        normalized = raw_gender.lower()
        if normalized == 'male':
            gender = 'M'
        elif normalized == 'female':
            gender = 'F'
        elif raw_gender:
            gender = raw_gender[0].upper()
        else:
            gender = '?'

        row = [speaker['id'], gender, speaker['age'], speaker['ethnicity'], speaker['native_language']]

        # Apply search filter if provided
        if search_query:
            needle = search_query.lower()
            haystack = f"{row[0]} {row[1]} {row[2]} {row[3]} {row[4]}".lower()
            if needle not in haystack:
                continue
        rows.append(row)
    return rows
def get_audio_length_from_metadata(voice_dir):
    """Return the audio length recorded in ``voice_dir/metadata.json`` as text.

    The result is the ``"audio_length_seconds"`` value (``0`` when the key is
    missing) formatted as ``"<seconds with one decimal>s"``. ``"N/A"`` is
    returned when the file is missing, cannot be parsed, or the value cannot be
    formatted as a number.
    """
    metadata_file = voice_dir / "metadata.json"
    if not metadata_file.exists():
        return "N/A"
    try:
        with open(metadata_file, 'r') as handle:
            info = json.load(handle)
        seconds = info.get("audio_length_seconds", 0)
        return f"{seconds:.1f}s"
    except Exception:
        return "N/A"
def get_freeform_table(speaker_id):
    """Get freeform table for a speaker (single row if exists).

    Returns ``[["Freeform", <audio length>]]`` when
    ``EARS_PATH/<speaker_id>_freeform`` exists and contains both ``audio.mp3``
    and ``speaker_latent.safetensors``; otherwise an empty list.
    """
    if not EARS_PATH.exists() or not speaker_id:
        return []

    voice_dir = EARS_PATH / f"{speaker_id}_freeform"
    if not voice_dir.exists():
        return []

    required_files = (voice_dir / "audio.mp3", voice_dir / "speaker_latent.safetensors")
    if not all(path.exists() for path in required_files):
        return []

    return [["Freeform", get_audio_length_from_metadata(voice_dir)]]
def get_emotions_for_speaker(speaker_id):
    """Get list of emotions with audio lengths available for a given speaker (excluding _joint_).

    A folder under ``EARS_PATH`` counts when its name starts with
    ``<speaker_id>_emo_``, does not contain ``_joint_``, splits into exactly
    two parts on ``_emo_``, and holds both ``audio.mp3`` and
    ``speaker_latent.safetensors``.

    Returns:
        ``(emotion, audio_length)`` tuples ordered by folder name; empty when
        ``EARS_PATH`` is absent or ``speaker_id`` is falsy.
    """
    if not EARS_PATH.exists() or not speaker_id:
        return []

    folder_prefix = f"{speaker_id}_emo_"
    found = []
    for voice_dir in sorted(EARS_PATH.iterdir()):
        if not voice_dir.is_dir():
            continue
        folder = voice_dir.name
        if not folder.startswith(folder_prefix) or "_joint_" in folder:
            continue
        pieces = folder.split('_emo_')
        if len(pieces) != 2:
            continue
        # Only list voices whose audio and latent files are both present.
        if (voice_dir / "audio.mp3").exists() and (voice_dir / "speaker_latent.safetensors").exists():
            found.append((pieces[1], get_audio_length_from_metadata(voice_dir)))
    return found
def get_emotions_table(speaker_id):
    """Return ``[emotion, audio_length]`` rows for a speaker; empty when no speaker is given."""
    if speaker_id:
        return [[name, duration] for name, duration in get_emotions_for_speaker(speaker_id)]
    return []
| # VCTK Helper Functions | |
def get_vctk_speakers():
    """Get list of VCTK speakers with their metadata.

    Every directory under ``VCTK_PATH`` whose name starts with ``p`` and that
    contains ``audio.mp3``, ``speaker_latent.safetensors`` and
    ``metadata.json`` contributes one dict with the keys ``id``, ``gender``,
    ``age``, ``details`` and ``audio_length``. Directories whose metadata
    cannot be read or formatted are skipped. Results are ordered by directory
    name; the list is empty when ``VCTK_PATH`` is absent.
    """
    if not VCTK_PATH.exists():
        return []

    required_names = ("audio.mp3", "speaker_latent.safetensors", "metadata.json")
    speakers = []
    for speaker_dir in sorted(VCTK_PATH.iterdir()):
        if not speaker_dir.is_dir() or not speaker_dir.name.startswith('p'):
            continue
        if not all((speaker_dir / name).exists() for name in required_names):
            continue
        try:
            with open(speaker_dir / "metadata.json", 'r') as handle:
                payload = json.load(handle)
            info = payload.get("speaker_info", {})
            seconds = payload.get("total_audio_length_seconds", 0)
            entry = {
                # Fall back to the folder name when the metadata has no id.
                'id': info.get('id', speaker_dir.name),
                'gender': info.get('gender', 'unknown'),
                'age': info.get('age', 'unknown'),
                'details': info.get('details', 'unknown'),
                'audio_length': f"{seconds:.1f}s",
            }
        except Exception:
            continue
        speakers.append(entry)
    return speakers
def get_vctk_speakers_table(search_query=""):
    """Get VCTK speakers as table data for Gradio, optionally filtered by search query.

    Each row is ``[id, gender, age, details, audio_length]`` with gender
    abbreviated to ``M``/``F`` (or the upper-cased first letter, ``?`` when
    empty). A non-empty ``search_query`` keeps only rows whose space-joined,
    lower-cased fields contain the lower-cased query.
    """
    rows = []
    for speaker in get_vctk_speakers():
        raw_gender = speaker['gender']
        normalized = raw_gender.lower()
        if normalized == 'male' or raw_gender == 'M':
            gender = 'M'
        elif normalized == 'female' or raw_gender == 'F':
            gender = 'F'
        elif raw_gender:
            gender = raw_gender[0].upper()
        else:
            gender = '?'

        row = [speaker['id'], gender, speaker['age'], speaker['details'], speaker['audio_length']]

        # Apply search filter if provided
        if search_query:
            needle = search_query.lower()
            haystack = f"{row[0]} {row[1]} {row[2]} {row[3]} {row[4]}".lower()
            if needle not in haystack:
                continue
        rows.append(row)
    return rows
def load_text_presets():
    """Load text presets from file with category and word count.

    Each non-blank line of ``TEXT_PRESETS_PATH`` becomes a row
    ``[category, word_count, text]``. A line is split on its first ``" | "``
    into category and text; lines without that separator get the category
    ``"Uncategorized"``. ``word_count`` is the whitespace-separated word count
    of the text, as a string. Returns an empty list when the file is missing.
    """
    if not TEXT_PRESETS_PATH.exists():
        return []

    with open(TEXT_PRESETS_PATH, 'r', encoding='utf-8') as handle:
        entries = [raw.strip() for raw in handle if raw.strip()]

    rows = []
    for entry in entries:
        category, separator, text = entry.partition(" | ")
        if not separator:
            # Fallback if no category
            category, text = "Uncategorized", entry
        rows.append([category, str(len(text.split())), text])
    return rows
def search_speakers(search_query):
    """Return a Gradio update holding the speakers table filtered by ``search_query``."""
    return gr.update(value=get_speakers_table(search_query))
def select_speaker_from_table(evt: gr.SelectData, table_data):
    """Handle speaker selection - populate freeform and emotions tables.

    Returns eight Gradio updates, in this order: speaker selection text,
    freeform table, emotions table, stored speaker id, audio preview,
    safetensors path, audio path, voice selection text. When the event carries
    no value, there is no table data, or the row index is not usable, every
    output is reset instead.
    """
    def _reset_outputs():
        return (
            gr.update(value="", visible=False),
            gr.update(value=[], visible=True),
            gr.update(value=[], visible=True),
            gr.update(value=""),
            gr.update(value=None),
            gr.update(value=""),
            gr.update(value=""),
            gr.update(value="", visible=False),
        )

    if not (evt.value and table_data is not None):
        return _reset_outputs()

    # evt.index is normally (row, col); only the row is needed.
    if isinstance(evt.index, (tuple, list)) and len(evt.index) >= 2:
        row_index = evt.index[0]
    else:
        row_index = evt.index

    # Index into the displayed (possibly filtered) table, a pandas DataFrame.
    if not (isinstance(row_index, int) and row_index < len(table_data)):
        return _reset_outputs()

    row = table_data.iloc[row_index]
    speaker_id = row.iloc[0]  # First column is the ID

    gender_code = row.iloc[1]
    if gender_code == "M":
        gender_full = "Male"
    elif gender_code == "F":
        gender_full = "Female"
    else:
        gender_full = gender_code
    selection_text = f"Selected Speaker: {speaker_id}\n{gender_full} • {row.iloc[2]} • {row.iloc[3]}"

    freeform_rows = get_freeform_table(speaker_id)
    emotion_rows = get_emotions_table(speaker_id)
    return (
        gr.update(value=selection_text, visible=True),  # Show speaker selection
        gr.update(value=freeform_rows, visible=True),  # Update freeform table
        gr.update(value=emotion_rows, visible=True),  # Update emotions table
        gr.update(value=speaker_id),  # Store speaker ID
        gr.update(value=None),  # Clear audio preview
        gr.update(value=""),  # Clear safetensors path
        gr.update(value=""),  # Clear audio path
        gr.update(value="", visible=False),  # Clear voice selection display
    )
def select_freeform_from_table(evt: gr.SelectData, speaker_id: str):
    """Handle freeform selection from table - load freeform voice files.

    Returns four Gradio updates: selection text, audio player value,
    safetensors path, audio path. They point at
    ``EARS_PATH/<speaker_id>_freeform`` when that folder exists and are reset
    otherwise. ``evt`` is not inspected.
    """
    if speaker_id:
        voice_dir = EARS_PATH / f"{speaker_id}_freeform"
        audio_file = str(voice_dir / "audio.mp3")
        latent_file = str(voice_dir / "speaker_latent.safetensors")
        if voice_dir.exists():
            label = f"Selected: Freeform\n{speaker_id}_freeform"
            return (
                gr.update(value=label, visible=True),  # Show freeform selection
                gr.update(value=audio_file),  # Update audio player
                gr.update(value=latent_file),  # Update safetensors path
                gr.update(value=audio_file),  # Update audio path for reconstruction
            )
    return (
        gr.update(value="", visible=False),
        gr.update(value=None),
        gr.update(value=""),
        gr.update(value=""),
    )
def select_emotion_from_table(evt: gr.SelectData, speaker_id: str):
    """Handle emotion selection - load voice files.

    The clicked row is looked up in the speaker's emotions table (recomputed
    here) to get the emotion name. Returns four Gradio updates: selection
    text, audio player value, safetensors path, audio path. They are reset
    when nothing usable was selected or the voice folder does not exist.
    """
    def _reset_outputs():
        return (
            gr.update(value="", visible=False),
            gr.update(value=None),
            gr.update(value=""),
            gr.update(value=""),
        )

    if not (evt.value and speaker_id):
        return _reset_outputs()

    # evt.index is (row, col); anything else falls back to the first row.
    if isinstance(evt.index, (tuple, list)) and len(evt.index) >= 2:
        row_index = evt.index[0]
    else:
        row_index = 0

    emotion_rows = get_emotions_table(speaker_id)
    if not (isinstance(row_index, int) and row_index < len(emotion_rows)):
        return _reset_outputs()

    emotion = emotion_rows[row_index][0]  # First column is emotion name
    voice_dir = EARS_PATH / f"{speaker_id}_emo_{emotion}"
    audio_file = str(voice_dir / "audio.mp3")
    latent_file = str(voice_dir / "speaker_latent.safetensors")
    if not voice_dir.exists():
        return _reset_outputs()

    label = f"Selected Emotion: {emotion.title()}\n{speaker_id}_emo_{emotion}"
    return (
        gr.update(value=label, visible=True),  # Show emotion selection
        gr.update(value=audio_file),  # Update audio player
        gr.update(value=latent_file),  # Update safetensors path
        gr.update(value=audio_file),  # Update audio path for reconstruction
    )
def select_vctk_speaker_from_table(evt: gr.SelectData, table_data):
    """Load the VCTK voice for the row clicked in the speakers table.

    Args:
        evt: Select event; ``evt.value`` must be truthy and ``evt.index``
            gives the clicked cell (``(row, col)``) or row.
        table_data: The table as currently displayed (accessed with
            ``.iloc``, i.e. a pandas DataFrame); column 0 holds the speaker
            ID and columns 1-3 are shown in the selection text.

    Returns:
        Five ``gr.update`` objects: selection display, stored speaker ID,
        audio player, safetensors path, audio path. All are cleared (display
        hidden) when the selection is invalid or the directory is missing.
    """
    selected_row = None
    if evt.value and table_data is not None:
        position = evt.index
        # A cell index arrives as (row, col); keep only the row.
        if isinstance(position, (tuple, list)) and len(position) >= 2:
            position = position[0]
        if isinstance(position, int) and position < len(table_data):
            selected_row = table_data.iloc[position]

    if selected_row is not None:
        speaker_id = selected_row.iloc[0]  # First column is the ID
        voice_dir = VCTK_PATH / speaker_id
        if voice_dir.exists():
            gender_code = selected_row.iloc[1]
            if gender_code == "M":
                gender_full = "Male"
            elif gender_code == "F":
                gender_full = "Female"
            else:
                gender_full = gender_code
            selection_text = (
                f"Selected Speaker: {speaker_id}\n"
                f"{gender_full} • {selected_row.iloc[2]} • {selected_row.iloc[3]}"
            )
            preview_audio = str(voice_dir / "audio.mp3")
            latent_file = str(voice_dir / "speaker_latent.safetensors")
            return (
                gr.update(value=selection_text, visible=True),  # Show speaker selection
                gr.update(value=speaker_id),                    # Store speaker ID
                gr.update(value=preview_audio),                 # Update audio player
                gr.update(value=latent_file),                   # Update safetensors path
                gr.update(value=preview_audio),                 # Update audio path for reconstruction
            )

    return (
        gr.update(value="", visible=False),
        gr.update(value=""),
        gr.update(value=None),
        gr.update(value=""),
        gr.update(value=""),
    )
def search_vctk_speakers(search_query):
    """Return a table update holding the VCTK speakers matching *search_query*.

    The filtering itself is delegated to ``get_vctk_speakers_table``.
    """
    return gr.update(value=get_vctk_speakers_table(search_query))
| # Expresso Helper Functions | |
def get_expresso_speakers():
    """Collect every usable Expresso voice together with its metadata.

    A voice is a sub-directory of ``EXPRESSO_PATH`` whose name starts with
    ``expresso_`` and which contains ``audio.mp3``,
    ``speaker_latent.safetensors`` and ``metadata.json``.

    Returns:
        A list of dicts with the keys ``id``, ``type``, ``speakers``,
        ``style`` and ``audio_length`` (formatted like ``"12.3s"``), ordered
        by sorted directory path. Empty when ``EXPRESSO_PATH`` does not exist.
        Voices whose metadata cannot be read or formatted are skipped with a
        logged warning.
    """
    if not EXPRESSO_PATH.exists():
        return []

    speakers_with_metadata = []
    for subdir in sorted(EXPRESSO_PATH.iterdir()):
        if not (subdir.is_dir() and subdir.name.startswith('expresso_')):
            continue

        metadata_path = subdir / "metadata.json"
        required_files = (
            subdir / "audio.mp3",
            subdir / "speaker_latent.safetensors",
            metadata_path,
        )
        if not all(path.exists() for path in required_files):
            continue

        try:
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's default encoding.
            with open(metadata_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            audio_length = data.get("audio_length_seconds", 0)
            entry = {
                'id': subdir.name,
                'type': data.get('type', 'unknown'),
                'speakers': data.get('speakers', 'unknown'),
                'style': data.get('style', 'unknown'),
                'audio_length': f"{audio_length:.1f}s",
            }
        except (OSError, ValueError, TypeError, AttributeError) as exc:
            # Best effort: one broken metadata file (I/O error, invalid
            # JSON/encoding, unexpected value types) must not hide the other
            # voices, but it should leave a trace instead of vanishing.
            logging.getLogger(__name__).warning(
                "Skipping Expresso voice %s: unusable metadata (%s)",
                subdir.name,
                exc,
            )
            continue

        speakers_with_metadata.append(entry)

    return speakers_with_metadata
def get_expresso_speakers_table(search_query=""):
    """Build the Expresso speakers table rows, optionally filtered.

    Args:
        search_query: Case-insensitive substring matched against the ``id``,
            ``type``, ``speakers`` and ``style`` fields. A falsy query
            disables filtering.

    Returns:
        A list of ``[id, type, speakers, style, audio_length]`` rows.
    """
    def matches(voice):
        # No query means every voice is kept.
        if not search_query:
            return True
        needle = search_query.lower()
        return any(
            needle in str(voice[field]).lower()
            for field in ('id', 'type', 'speakers', 'style')
        )

    return [
        [voice['id'], voice['type'], voice['speakers'], voice['style'], voice['audio_length']]
        for voice in get_expresso_speakers()
        if matches(voice)
    ]
def select_expresso_speaker_from_table(evt: gr.SelectData, table_data):
    """Load the Expresso voice for the row clicked in the speakers table.

    Args:
        evt: Select event; ``evt.value`` must be truthy and ``evt.index``
            gives the clicked cell (``(row, col)``) or row.
        table_data: The table as currently displayed (accessed with
            ``.iloc``, i.e. a pandas DataFrame); column 0 holds the voice ID
            and columns 1-3 are shown in the selection text.

    Returns:
        Five ``gr.update`` objects: selection display, stored speaker ID,
        audio player, safetensors path, audio path. All are cleared (display
        hidden) when the selection is invalid or the directory is missing.
    """
    selected_row = None
    if evt.value and table_data is not None:
        position = evt.index
        # A cell index arrives as (row, col); keep only the row.
        if isinstance(position, (tuple, list)) and len(position) >= 2:
            position = position[0]
        if isinstance(position, int) and position < len(table_data):
            selected_row = table_data.iloc[position]

    if selected_row is not None:
        speaker_id = selected_row.iloc[0]  # First column is the ID
        voice_dir = EXPRESSO_PATH / speaker_id
        if voice_dir.exists():
            selection_text = (
                f"Selected Voice: {speaker_id}\n"
                f"Type: {selected_row.iloc[1]} • Speakers: {selected_row.iloc[2]} • Style: {selected_row.iloc[3]}"
            )
            preview_audio = str(voice_dir / "audio.mp3")
            latent_file = str(voice_dir / "speaker_latent.safetensors")
            return (
                gr.update(value=selection_text, visible=True),  # Show speaker selection
                gr.update(value=speaker_id),                    # Store speaker ID
                gr.update(value=preview_audio),                 # Update audio player
                gr.update(value=latent_file),                   # Update safetensors path
                gr.update(value=preview_audio),                 # Update audio path for reconstruction
            )

    return (
        gr.update(value="", visible=False),
        gr.update(value=""),
        gr.update(value=None),
        gr.update(value=""),
        gr.update(value=""),
    )
def search_expresso_speakers(search_query):
    """Return a table update holding the Expresso voices matching *search_query*.

    The filtering itself is delegated to ``get_expresso_speakers_table``.
    """
    return gr.update(value=get_expresso_speakers_table(search_query))
| # HF-Custom Helper Functions | |
def get_hf_custom_speakers():
    """Collect every usable HF-Custom voice together with its metadata.

    A voice is any sub-directory of ``HF_CUSTOM_PATH`` that contains
    ``audio.mp3``, ``speaker_latent.safetensors`` and ``metadata.json``.

    Returns:
        A list of dicts with the keys ``name``, ``dataset``, ``description``
        and ``audio_length`` (formatted like ``"12.3s"``), ordered by sorted
        directory path. Empty when ``HF_CUSTOM_PATH`` does not exist. Voices
        whose metadata cannot be read or formatted are skipped with a logged
        warning.
    """
    if not HF_CUSTOM_PATH.exists():
        return []

    speakers_with_metadata = []
    for subdir in sorted(HF_CUSTOM_PATH.iterdir()):
        if not subdir.is_dir():
            continue

        metadata_path = subdir / "metadata.json"
        required_files = (
            subdir / "audio.mp3",
            subdir / "speaker_latent.safetensors",
            metadata_path,
        )
        if not all(path.exists() for path in required_files):
            continue

        try:
            # Read as UTF-8 explicitly so free-text fields do not depend on
            # the platform's default encoding.
            with open(metadata_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            audio_length = data.get("audio_duration_seconds", 0)
            entry = {
                # NOTE(review): select_hf_custom_speaker_from_table resolves
                # the voice directory from this displayed name; if a
                # metadata "speaker_name" differs from the directory name,
                # that lookup will not find the directory - confirm.
                'name': data.get('speaker_name', subdir.name),
                'dataset': data.get('dataset_name', ''),
                'description': data.get('speaker_description', ''),
                'audio_length': f"{audio_length:.1f}s",
            }
        except (OSError, ValueError, TypeError, AttributeError) as exc:
            # Best effort: one broken metadata file (I/O error, invalid
            # JSON/encoding, unexpected value types) must not hide the other
            # voices, but it should leave a trace instead of vanishing.
            logging.getLogger(__name__).warning(
                "Skipping HF-Custom voice %s: unusable metadata (%s)",
                subdir.name,
                exc,
            )
            continue

        speakers_with_metadata.append(entry)

    return speakers_with_metadata
def get_hf_custom_speakers_table(search_query=""):
    """Build the HF-Custom speakers table rows, optionally filtered.

    Args:
        search_query: Case-insensitive substring matched against the
            ``name``, ``dataset`` and ``description`` fields. A falsy query
            disables filtering.

    Returns:
        A list of ``[name, dataset, description, audio_length]`` rows.
    """
    def matches(voice):
        # No query means every voice is kept.
        if not search_query:
            return True
        needle = search_query.lower()
        return any(
            needle in str(voice[field]).lower()
            for field in ('name', 'dataset', 'description')
        )

    return [
        [voice['name'], voice['dataset'], voice['description'], voice['audio_length']]
        for voice in get_hf_custom_speakers()
        if matches(voice)
    ]
def select_hf_custom_speaker_from_table(evt: gr.SelectData, table_data):
    """Load the HF-Custom voice for the row clicked in the speakers table.

    Args:
        evt: Select event; ``evt.value`` must be truthy and ``evt.index``
            gives the clicked cell (``(row, col)``) or row.
        table_data: The table as currently displayed (accessed with
            ``.iloc``, i.e. a pandas DataFrame); column 0 holds the speaker
            name, column 1 the dataset and column 2 the description.

    Returns:
        Five ``gr.update`` objects: selection display, stored speaker name,
        audio player, safetensors path, audio path. All are cleared (display
        hidden) when the selection is invalid or the directory is missing.
    """
    selected_row = None
    if evt.value and table_data is not None:
        position = evt.index
        # A cell index arrives as (row, col); keep only the row.
        if isinstance(position, (tuple, list)) and len(position) >= 2:
            position = position[0]
        if isinstance(position, int) and position < len(table_data):
            selected_row = table_data.iloc[position]

    if selected_row is not None:
        speaker_name = selected_row.iloc[0]  # First column is the name
        voice_dir = HF_CUSTOM_PATH / speaker_name
        if voice_dir.exists():
            # The dataset suffix is only shown when the column is non-empty.
            if selected_row.iloc[1]:
                dataset_info = f" • {selected_row.iloc[1]}"
            else:
                dataset_info = ""
            selection_text = f"Selected Voice: {speaker_name}{dataset_info}\n{selected_row.iloc[2]}"
            preview_audio = str(voice_dir / "audio.mp3")
            latent_file = str(voice_dir / "speaker_latent.safetensors")
            return (
                gr.update(value=selection_text, visible=True),  # Show speaker selection
                gr.update(value=speaker_name),                  # Store speaker name
                gr.update(value=preview_audio),                 # Update audio player
                gr.update(value=latent_file),                   # Update safetensors path
                gr.update(value=preview_audio),                 # Update audio path for reconstruction
            )

    return (
        gr.update(value="", visible=False),
        gr.update(value=""),
        gr.update(value=None),
        gr.update(value=""),
        gr.update(value=""),
    )
def search_hf_custom_speakers(search_query):
    """Return a table update holding the HF-Custom voices matching *search_query*.

    The filtering itself is delegated to ``get_hf_custom_speakers_table``.
    """
    return gr.update(value=get_hf_custom_speakers_table(search_query))
| # Audio Prompt Library functions | |
# File suffixes (lower-case, including the leading dot) that
# get_audio_prompt_files() accepts when listing the audio prompt folder;
# suffixes are lower-cased before the membership test.
AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".ogg", ".flac", ".webm", ".aac", ".opus"}
def get_audio_prompt_files():
    """List the audio files available in the audio prompt folder.

    Returns:
        One single-element row ``[file_name]`` per regular file in
        ``AUDIO_PROMPT_FOLDER`` whose lower-cased suffix is in
        ``AUDIO_EXTS``, sorted case-insensitively by name. Empty when the
        folder is ``None`` or does not exist.
    """
    if AUDIO_PROMPT_FOLDER is None or not AUDIO_PROMPT_FOLDER.exists():
        return []

    names = []
    for entry in AUDIO_PROMPT_FOLDER.iterdir():
        if entry.is_file() and entry.suffix.lower() in AUDIO_EXTS:
            names.append(entry.name)
    names.sort(key=str.lower)

    # Each name becomes its own table row.
    return [[name] for name in names]
def select_audio_prompt_file(evt: gr.SelectData):
    """Resolve the audio prompt file clicked in the prompt table.

    Args:
        evt: Select event whose ``value`` is the clicked file name.

    Returns:
        ``gr.update(value=<path>)`` with the file's path inside
        ``AUDIO_PROMPT_FOLDER`` when the selection names an existing regular
        file there; otherwise an empty ``gr.update()`` (no change).
    """
    if not evt.value or AUDIO_PROMPT_FOLDER is None:
        return gr.update()

    file_name = str(evt.value)
    # The value comes from the client, so accept only a bare file name:
    # anything with directory components (e.g. "../x" or an absolute path)
    # could point outside the prompt folder.
    if Path(file_name).name != file_name:
        return gr.update()

    file_path = AUDIO_PROMPT_FOLDER / file_name
    # is_file() (rather than exists()) also rejects directories such as "..".
    if file_path.is_file():
        return gr.update(value=str(file_path))
    return gr.update()
def switch_dataset(dataset_name):
    """Switch the voice-source UI between the custom panel and the voicebanks.

    Args:
        dataset_name: ``"Custom Audio Panel"``, ``"EARS"``, ``"VCTK"`` or
            ``"Expresso"``; any other value is handled as HF-Custom.

    Returns:
        A tuple of 19 ``gr.update`` objects, in this order: dataset license
        info, custom audio row, voicebank row, voice type column, EARS
        column, VCTK column, Expresso column, HF-Custom column, selected
        speaker display, freeform table, emotions table, selected voice
        display, VCTK / Expresso / HF-Custom speaker displays, selected
        speaker state, audio preview, safetensors path state, audio path
        state. Every call clears the current selection.
    """
    license_texts = {
        "EARS": "**EARS Dataset License:** Creative Commons Attribution 4.0 International ([CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/))",
        "VCTK": "**VCTK Dataset License:** Creative Commons Attribution 4.0 International ([CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/))",
        "Expresso": "**Expresso Dataset License:** Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ([CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/))",
        "HF-Custom": "**HF-Custom Voices:** Available in dataset cache (information in metadata.json per voice). Also view dataset at [jordand/echo-embeddings-custom](https://huggingface.co/datasets/jordand/echo-embeddings-custom)",
    }

    custom_panel = dataset_name == "Custom Audio Panel"
    if custom_panel:
        active = None
    else:
        # Unrecognised names fall through to HF-Custom.
        active = next(
            (name for name in ("EARS", "VCTK", "Expresso") if dataset_name == name),
            "HF-Custom",
        )

    if custom_panel:
        license_update = gr.update(value="", visible=False)
        speaker_display_update = gr.update(value="", visible=False)
        freeform_update = gr.update(value=[])
        emotions_update = gr.update(value=[])
    else:
        license_update = gr.update(value=license_texts[active], visible=True)
        speaker_display_update = gr.update(value="")
        freeform_update = gr.update(value=[], visible=True)
        emotions_update = gr.update(value=[], visible=True)

    return (
        license_update,                                        # dataset_license_info
        gr.update(visible=custom_panel),                       # custom_audio_row
        gr.update(visible=not custom_panel),                   # voicebank_row
        gr.update(visible=active == "EARS"),                   # voice_type_column (EARS only)
        # ears_column stays visible for the custom panel too; it sits inside
        # the voicebank row, which is hidden in that case.
        gr.update(visible=custom_panel or active == "EARS"),   # ears_column
        gr.update(visible=active == "VCTK"),                   # vctk_column
        gr.update(visible=active == "Expresso"),               # expresso_column
        gr.update(visible=active == "HF-Custom"),              # hf_custom_column
        # Clear selections
        speaker_display_update,                                # selected_speaker_display
        freeform_update,                                       # freeform_table
        emotions_update,                                       # emotions_table
        gr.update(value="", visible=False),                    # selected_voice_display
        gr.update(value="", visible=False),                    # vctk_speaker_display
        gr.update(value="", visible=False),                    # expresso_speaker_display
        gr.update(value="", visible=False),                    # hf_custom_speaker_display
        gr.update(value=""),                                   # selected_speaker_state
        gr.update(value=None),                                 # audio_preview
        gr.update(value=""),                                   # speaker_st_path_state
        gr.update(value=""),                                   # speaker_audio_path_state
    )
def select_text_preset(evt: gr.SelectData):
    """Return the text of the preset row clicked in the presets table.

    Args:
        evt: Select event; ``evt.value`` must be truthy and ``evt.index``
            gives the clicked cell (``(row, col)``) or row.

    Returns:
        ``gr.update(value=<text>)`` with column 2 of the matching row from
        ``load_text_presets()``, or an empty ``gr.update()`` when nothing
        valid was selected.
    """
    if not evt.value:
        return gr.update()

    position = evt.index
    # A cell index arrives as (row, col); keep only the row.
    if isinstance(position, (tuple, list)) and len(position) >= 2:
        position = position[0]

    presets_data = load_text_presets()
    if isinstance(position, int) and position < len(presets_data):
        return gr.update(value=presets_data[position][2])  # Column 2 is the text
    return gr.update()
def update_cfg_visibility(cfg_mode):
    """Adapt the CFG controls to the selected guidance mode.

    Args:
        cfg_mode: The selected mode. ``"joint-unconditional"`` and
            ``"apg-independent"`` are special-cased; every other value
            (independent or alternating) gets the same layout.

    Returns:
        Three ``gr.update`` objects: label/info for the first scale control,
        visibility of the second control, visibility of the third control
        (shown only for ``"apg-independent"``).
    """
    joint = cfg_mode == "joint-unconditional"

    if joint:
        label = "Text/Speaker CFG Scale"
        info = "Guidance strength for text and speaker (joint)"
    else:
        label = "Text CFG Scale"
        info = "Guidance strength for text"

    return (
        gr.update(label=label, info=info),
        gr.update(visible=not joint),
        gr.update(visible=cfg_mode == "apg-independent"),
    )
def toggle_speaker_k_fields(enabled):
    """Show or hide the speaker K row.

    Only visibility is changed; no field values are written here.
    """
    row_update = gr.update(visible=enabled)
    return row_update
def toggle_custom_shapes_fields(enabled):
    """Show the custom shapes row when *enabled* is truthy, hide it otherwise.

    Only the row's visibility is updated; this function does not write any
    field values.
    """
    return gr.update(visible=bool(enabled))
def toggle_mode(mode, speaker_k_enable_val, speaker_kv_simple_val):
    """Switch between simple and advanced mode and sync the speaker KV flags.

    Args:
        mode: ``"Simple Mode"`` selects simple mode; anything else is
            treated as advanced mode.
        speaker_k_enable_val: Current value of the advanced-mode checkbox.
        speaker_kv_simple_val: Current value of the simple-mode checkbox.

    Returns:
        Five ``gr.update`` objects: simple mode row visibility, advanced
        compile column visibility, advanced column visibility, simple
        checkbox value, advanced checkbox value. Entering simple mode copies
        the advanced value to both checkboxes; entering advanced mode copies
        the simple value to both.
    """
    simple = mode == "Simple Mode"
    # Both checkboxes end up with the value passed in for the other mode.
    synced_value = speaker_k_enable_val if simple else speaker_kv_simple_val

    return (
        gr.update(visible=simple),       # simple_mode_row (speaker KV checkbox)
        gr.update(visible=not simple),   # advanced_mode_compile_column
        gr.update(visible=not simple),   # advanced_mode_column (all other parameters)
        gr.update(value=synced_value),   # simple checkbox
        gr.update(value=synced_value),   # speaker_k_enable
    )
def sync_simple_to_advanced(simple_enabled):
    """Mirror the simple-mode speaker KV checkbox onto the advanced controls.

    Returns:
        Five ``gr.update`` objects: speaker_k_enable, speaker_k_row,
        speaker_k_scale, speaker_k_min_t, speaker_k_max_layers. Enabling
        sets the three numeric fields to 1.5 / 0.9 / 24; disabling leaves
        them untouched.
    """
    if not simple_enabled:
        return (
            gr.update(value=False),    # speaker_k_enable
            gr.update(visible=False),  # speaker_k_row
            gr.update(),               # speaker_k_scale (no change)
            gr.update(),               # speaker_k_min_t (no change)
            gr.update(),               # speaker_k_max_layers (no change)
        )

    return (
        gr.update(value=True),    # speaker_k_enable
        gr.update(visible=True),  # speaker_k_row
        gr.update(value=1.5),     # speaker_k_scale
        gr.update(value=0.9),     # speaker_k_min_t
        gr.update(value=24),      # speaker_k_max_layers
    )
def apply_core_preset(preset_name):
    """Apply the core sampling parameters preset.

    Returns:
        A list of four ``gr.update`` objects (rng_seed, num_steps, cfg_mode,
        main preset selector). Only ``"default"`` is known; any other name
        yields four no-op updates.
    """
    if preset_name != "default":
        return [gr.update()] * 4

    # rng_seed, num_steps, cfg_mode, and the main preset set to "Custom".
    core_values = (0, 40, "independent", "Custom")
    return [gr.update(value=value) for value in core_values]
def apply_cfg_preset(preset_name):
    """Apply a CFG guidance preset.

    Returns:
        A list of five ``gr.update`` objects (cfg_scale_text,
        cfg_scale_speaker, cfg_min_t, cfg_max_t, main preset selector set to
        ``"Custom"``), or five no-op updates for an unknown preset name.
    """
    # (text scale, speaker scale, min t, max t)
    presets = {
        "default": (3.0, 5.0, 0.5, 1.0),
        "higher speaker": (3.0, 8.0, 0.5, 1.0),
        "large guidances": (8.0, 8.0, 0.5, 1.0),
    }

    chosen = presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 5

    # The trailing "Custom" marks the main preset as manually modified.
    return [gr.update(value=value) for value in (*chosen, "Custom")]
def apply_speaker_kv_preset(preset_name):
    """Apply a speaker KV attention control preset.

    Returns:
        A list of three ``gr.update`` objects (speaker_k_enable value,
        speaker_k_row visibility, main preset selector set to ``"Custom"``)
        for ``"enable"`` / ``"off"``, or three no-op updates otherwise.
    """
    if preset_name not in ("enable", "off"):
        return [gr.update()] * 3

    enabled = preset_name == "enable"
    return [
        gr.update(value=enabled),    # speaker_k_enable
        gr.update(visible=enabled),  # speaker_k_row
        gr.update(value="Custom"),   # Set main preset to Custom
    ]
def apply_truncation_preset(preset_name):
    """Apply a truncation & temporal rescaling preset.

    Returns:
        A list of four ``gr.update`` objects (truncation factor, rescale k,
        rescale sigma, main preset selector set to ``"Custom"``), or four
        no-op updates when the name is not a known preset (``"custom"``
        included).
    """
    # (truncation, rescale_k, rescale_sigma)
    presets = {
        "flat": (0.8, 1.2, 3.0),
        "sharp": (0.9, 0.96, 3.0),
        "baseline(sharp)": (1.0, 1.0, 3.0),
    }

    chosen = presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 4  # Return no changes for custom

    # The trailing "Custom" marks the main preset as manually modified.
    return [gr.update(value=value) for value in (*chosen, "Custom")]
def apply_apg_preset(preset_name):
    """Apply an APG parameters preset.

    Returns:
        A list of seven ``gr.update`` objects (apg_eta_text,
        apg_eta_speaker, apg_momentum_text, apg_momentum_speaker,
        apg_norm_text, apg_norm_speaker, main preset selector set to
        ``"Custom"``), or seven no-op updates for an unknown preset name.
    """
    # (eta_text, eta_speaker, momentum_text, momentum_speaker, norm_text, norm_speaker)
    presets = {
        "default": (0.5, 0.5, -0.25, -0.25, "", ""),       # default: -0.25 momentum
        "no momentum": (0.0, 0.0, 0.0, 0.0, "", ""),       # no momentum: 0 momentum
        "norms": (0.5, 0.5, -0.25, -0.25, "7.5", "7.5"),   # norms: default + 7.5 norms
        "no eta": (0.0, 0.0, -0.25, -0.25, "", ""),        # no eta: 0 eta
    }

    chosen = presets.get(preset_name)
    if chosen is None:
        return [gr.update()] * 7

    field_updates = [gr.update(value=value) for value in chosen]
    # Finally mark the main preset as manually modified.
    field_updates.append(gr.update(value="Custom"))
    return field_updates
def load_sampler_presets():
    """Load the sampler presets, creating the presets file on first use.

    Returns:
        The parsed contents of ``SAMPLER_PRESETS_PATH`` when that file
        exists. Otherwise the two built-in presets, which are also written
        to ``SAMPLER_PRESETS_PATH`` (best effort: a failed write is logged
        and the built-in presets are still returned).

    Raises:
        OSError, ValueError: If an existing presets file cannot be read or
            does not contain valid JSON.
    """
    # Try to read first instead of checking exists() and then opening, which
    # leaves a window for the file to disappear in between.
    try:
        with open(SAMPLER_PRESETS_PATH, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    # No presets file yet: fall back to the built-in presets. Numeric values
    # are stored as strings, as apply_sampler_preset converts them.
    default_presets = {
        "Flat (Independent)": {
            "num_steps": "30",
            "cfg_mode": "independent",
            "cfg_scale_text": "3.0",
            "cfg_scale_speaker": "5.0",
            "cfg_min_t": "0.5",
            "cfg_max_t": "1.0",
            "truncation_factor": "0.8",
            "rescale_k": "1.2",
            "rescale_sigma": "3.0"
        },
        "Sharp (Independent)": {
            "num_steps": "30",
            "cfg_mode": "independent",
            "cfg_scale_text": "3.0",
            "cfg_scale_speaker": "5.0",
            "cfg_min_t": "0.5",
            "cfg_max_t": "1.0",
            "truncation_factor": "0.9",
            "rescale_k": "0.96",
            "rescale_sigma": "3.0"
        },
    }

    try:
        with open(SAMPLER_PRESETS_PATH, 'w', encoding='utf-8') as f:
            json.dump(default_presets, f, indent=2)
    except OSError as exc:
        # Persisting the defaults is a convenience; an unwritable location
        # should not prevent the caller from getting the presets.
        logging.getLogger(__name__).warning(
            "Could not write default sampler presets to %s: %s",
            SAMPLER_PRESETS_PATH,
            exc,
        )

    return default_presets
def apply_sampler_preset(preset_name):
    """Apply a named sampler preset to all sampler fields.

    Args:
        preset_name: Key into ``load_sampler_presets()``. ``"Custom"`` or an
            unknown name changes nothing.

    Returns:
        A list of 20 ``gr.update`` objects, in order: num_steps, cfg_mode,
        cfg_scale_text, cfg_scale_speaker (hidden for
        ``"joint-unconditional"``), cfg_min_t, cfg_max_t, truncation_factor,
        rescale_k, rescale_sigma, speaker_k_enable, speaker_k_row
        visibility, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers,
        apg_eta_text, apg_eta_speaker, apg_momentum_text,
        apg_momentum_speaker, apg_norm_text, apg_norm_speaker.

    Raises:
        KeyError: If the preset lacks one of the core keys (``num_steps``,
            ``cfg_mode``, ``cfg_scale_text``, ``cfg_scale_speaker``,
            ``cfg_min_t``, ``cfg_max_t``, ``truncation_factor``,
            ``rescale_k``, ``rescale_sigma``).
    """
    presets = load_sampler_presets()
    if preset_name == "Custom" or preset_name not in presets:
        return [gr.update()] * 20  # Return no changes for custom

    preset = presets[preset_name]

    def to_num(val, default):
        """Convert a stored value to a number, using *default* when unusable."""
        # An explicit null in the presets file counts as "not set".
        if val is None:
            return default
        try:
            return float(val) if isinstance(val, str) else val
        except (ValueError, TypeError):
            return default

    def to_bool(val):
        """Interpret a stored flag; strings like "false" must not be truthy."""
        if isinstance(val, str):
            return val.strip().lower() in ("1", "true", "yes", "on")
        return bool(val)

    # The speaker scale is meaningless when text and speaker are guided jointly.
    speaker_visible = (preset["cfg_mode"] != "joint-unconditional")
    speaker_k_enabled = to_bool(preset.get("speaker_k_enable", False))

    return [
        gr.update(value=int(to_num(preset["num_steps"], 40))),
        gr.update(value=preset["cfg_mode"]),
        gr.update(value=to_num(preset["cfg_scale_text"], 3.0)),
        gr.update(value=to_num(preset["cfg_scale_speaker"], 5.0), visible=speaker_visible),
        gr.update(value=to_num(preset["cfg_min_t"], 0.5)),
        gr.update(value=to_num(preset["cfg_max_t"], 1.0)),
        gr.update(value=to_num(preset["truncation_factor"], 0.8)),
        gr.update(value=to_num(preset["rescale_k"], 1.2)),
        gr.update(value=to_num(preset["rescale_sigma"], 3.0)),
        gr.update(value=speaker_k_enabled),
        gr.update(visible=speaker_k_enabled),  # speaker_k_row
        gr.update(value=to_num(preset.get("speaker_k_scale", "1.5"), 1.5)),
        gr.update(value=to_num(preset.get("speaker_k_min_t", "0.9"), 0.9)),
        gr.update(value=int(to_num(preset.get("speaker_k_max_layers", "24"), 24))),
        gr.update(value=to_num(preset.get("apg_eta_text", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_eta_speaker", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_momentum_text", "0.0"), 0.0)),
        gr.update(value=to_num(preset.get("apg_momentum_speaker", "0.0"), 0.0)),
        gr.update(value=preset.get("apg_norm_text", "")),  # Keep as string (can be empty)
        gr.update(value=preset.get("apg_norm_speaker", "")),  # Keep as string (can be empty)
    ]
| # Build Gradio Interface | |
# Custom stylesheet handed to gr.Blocks(css=...). It styles the inline preset
# links (a.preset-link) and their hidden proxy buttons (.proxy-btn), the
# parameter group boxes, the generated-audio player, the mode selector,
# section separators, headers and the tip box, each with dark-mode overrides.
LINK_CSS = """
.preset-inline { display:flex; align-items:baseline; gap:6px; margin-top:-4px; margin-bottom:-12px; }
.preset-inline .title { font-weight:600; font-size:.95rem; }
.preset-inline .dim { color:#666; margin:0 4px; }
/* blue, linky */
a.preset-link { color: #0a5bd8; text-decoration: underline; cursor: pointer; font-weight: 400; }
a.preset-link:hover { text-decoration: none; opacity: 0.8; }
/* Dark mode support for preset links */
.dark a.preset-link,
[data-theme="dark"] a.preset-link {
color: #60a5fa !important;
}
.dark a.preset-link:hover,
[data-theme="dark"] a.preset-link:hover {
color: #93c5fd !important;
}
.dark .preset-inline .dim,
[data-theme="dark"] .preset-inline .dim {
color: #9ca3af !important;
}
/* keep proxy buttons in DOM but invisible */
.proxy-btn { position:absolute; width:0; height:0; overflow:hidden; padding:0 !important; margin:0 !important; border:0 !important; opacity:0; pointer-events:none; }
/* Better contrast for parameter group boxes */
.gr-group {
border: 1px solid #d1d5db !important;
background: #f3f4f6 !important;
}
.dark .gr-group,
[data-theme="dark"] .gr-group {
border: 1px solid #4b5563 !important;
background: #1f2937 !important;
}
/* Highlight generated audio */
.generated-audio-player {
border: 3px solid #667eea !important;
border-radius: 12px !important;
padding: 20px !important;
background: linear-gradient(135deg, rgba(102, 126, 234, 0.08) 0%, rgba(118, 75, 162, 0.05) 100%) !important;
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2) !important;
margin: 1rem 0 !important;
}
.generated-audio-player > div {
background: transparent !important;
}
/* Make Parameter Mode selector more prominent */
#component-mode-selector {
text-align: center;
padding: 1rem 0;
}
#component-mode-selector label {
font-size: 1.1rem !important;
font-weight: 600 !important;
margin-bottom: 0.5rem !important;
}
#component-mode-selector .wrap {
justify-content: center !important;
}
#component-mode-selector fieldset {
border: 2px solid #e5e7eb !important;
border-radius: 8px !important;
padding: 1rem !important;
background: #f9fafb !important;
}
.dark #component-mode-selector fieldset,
[data-theme="dark"] #component-mode-selector fieldset {
border: 2px solid #4b5563 !important;
background: #1f2937 !important;
}
/* Stronger section separators */
.section-separator {
height: 3px !important;
background: linear-gradient(90deg, transparent 0%, #667eea 20%, #764ba2 80%, transparent 100%) !important;
border: none !important;
margin: 2rem 0 !important;
}
.dark .section-separator,
[data-theme="dark"] .section-separator {
background: linear-gradient(90deg, transparent 0%, #667eea 20%, #764ba2 80%, transparent 100%) !important;
}
/* Section headers styling */
.gradio-container h1,
.gradio-container h2 {
font-weight: 700 !important;
margin-top: 1.5rem !important;
margin-bottom: 1rem !important;
}
/* Highlighted tip box */
.tip-box {
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%) !important;
border-left: 4px solid #f59e0b !important;
border-radius: 8px !important;
padding: 1rem 1.5rem !important;
margin: 1rem 0 !important;
box-shadow: 0 2px 4px rgba(245, 158, 11, 0.1) !important;
}
.tip-box strong {
color: #92400e !important;
}
.dark .tip-box,
[data-theme="dark"] .tip-box {
background: linear-gradient(135deg, #451a03 0%, #78350f 100%) !important;
border-left: 4px solid #f59e0b !important;
}
.dark .tip-box strong,
[data-theme="dark"] .tip-box strong {
color: #fbbf24 !important;
}
"""
# JavaScript handed to gr.Blocks(js=...). It resolves a queryable root (the
# <gradio-app> shadow root when present, otherwise `document`) and installs a
# single capturing click listener on it. A click on any
# <a class="preset-link" data-fire="ID"> is cancelled and forwarded as a click
# on the element with id ID (or on the first button / [role='button'] inside
# it), which lets the inline "preset" text links trigger the "proxy-btn"
# Gradio buttons declared in the layout.
JS_CODE = r"""
function () {
// Get a queryable root, regardless of Shadow DOM
const appEl = document.querySelector("gradio-app");
const root = appEl && appEl.shadowRoot ? appEl.shadowRoot : document;
function clickHiddenButtonById(id) {
if (!id) return;
const host = root.getElementById(id);
if (!host) return;
const realBtn = host.querySelector("button, [role='button']") || host;
realBtn.click();
}
// Delegate clicks from any <a class="preset-link" data-fire="...">
root.addEventListener("click", (ev) => {
const a = ev.target.closest("a.preset-link");
if (!a) return;
ev.preventDefault();
ev.stopPropagation();
ev.stopImmediatePropagation();
clickHiddenButtonById(a.getAttribute("data-fire"));
return false;
}, true);
}
"""
def init_session():
    """Return a new random identifier for one browser tab/session.

    Returns:
        str: 16 hexadecimal characters, generated from 8 random bytes by
        ``secrets.token_hex``.
    """
    session_token = secrets.token_hex(nbytes=8)
    return session_token
def init_and_compile():
    """Create a session ID and, when not on Zero GPU, compile at page load.

    Returns:
        tuple: ``(session_id, status_update, checkbox_update)``. On Zero GPU
        the last two items are no-op ``gr.update()`` values; otherwise they
        are the two values returned by ``do_compile()``.
    """
    new_session_id = secrets.token_hex(8)

    # Zero GPU: never attempt compilation, leave both compile widgets as-is.
    if IS_ZEROGPU:
        return new_session_id, gr.update(), gr.update()

    # Compile right away so Simple mode (whose compile checkbox defaults to
    # True) gets compiled too. The returned status update targets a widget
    # that lives in the Advanced-mode column, which is hidden in Simple mode.
    compile_result = do_compile()
    status_update, checkbox_update = compile_result
    return new_session_id, status_update, checkbox_update
| with gr.Blocks(title="Echo-TTS", css=LINK_CSS, js=JS_CODE) as demo: | |
| gr.Markdown("# Echo-TTS") | |
| gr.Markdown("*Jordan Darefsky, 2025. See technical details [here](https://jordandarefsky.com/blog/2025/echo/)*") | |
| # License notice for Fish Speech autoencoder | |
| gr.Markdown("**License Notice:** All audio outputs are subject to non-commercial use only due to the [Fish Speech S1-DAC autoencoder](https://github.com/fishaudio/fish-speech) being licensed under [CC-BY-NC-SA-4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/).") | |
| # Silentcipher watermarking notice | |
| if USE_SILENTCIPHER: | |
| gr.Markdown(f"*Audio output is watermarked with [silentcipher](https://github.com/sony/silentcipher) using message `{SILENTCIPHER_MESSAGE}`*") | |
| # Instructions for Simple Mode | |
| with gr.Accordion("📖 Quick Start Instructions", open=True): | |
| gr.Markdown(""" | |
| ### Simple Mode (Recommended for Beginners) | |
| 1. **Pick or upload a voice** - Choose from the voicebank or upload your own audio (up to 2 minutes) | |
| 2. **Choose a text prompt preset or enter your own prompt** - What you want the voice to say (the presets are a good guide for format/style) | |
| 3. **Select a Sampling preset** - The default preset "Independent (High Speaker CFG)" is usually good to start | |
| 4. **Click Generate Audio** - Wait for the model to generate your audio | |
| <div class="tip-box"> | |
| 💡 **Tip:** If the generated voice doesn't match the reference speaker at all, enable "Speaker KV Attention Scaling" and click Generate Audio again. | |
| </div> | |
| ### Advanced Mode | |
| Switch to Advanced mode for full control over all generation parameters including CFG scales, sampling steps, truncation, and more. | |
| ### Other tips | |
| High CFG settings are recommended but may lead to oversaturation; APG might help with this. Flat settings tend to reduce "impulse" artifacts but might result in worse (blunted/compressed/artifact-y) laughter, breathing, etc. generation. | |
| Echo will try to fit the entire text-prompt into (<=) 30 seconds of audio. If your prompt is very long, the generated speech may be too quick (this is not an issue for shorter text-prompts). For disfluent, single-speaker speech, we recommend trying the reference text beginning with "[S1] ... explore how we can design" as a starting point. | |
| """) | |
| # Session state for per-user file management | |
| session_id_state = gr.State(None) | |
| # Hidden state variables to store paths and selection | |
| selected_speaker_state = gr.Textbox(visible=False, value="") | |
| speaker_st_path_state = gr.Textbox(visible=False, value="") | |
| speaker_audio_path_state = gr.Textbox(visible=False, value="") | |
| gr.Markdown("# Voice Selection") | |
| # Dataset selector | |
| dataset_selector = gr.Radio( | |
| choices=["Custom Audio Panel", "EARS", "VCTK", "Expresso", "HF-Custom"], | |
| value="Custom Audio Panel", | |
| label="Select Dataset", | |
| info="Choose which voicebank to use" | |
| ) | |
| dataset_license_info = gr.Markdown( | |
| "", | |
| visible=False | |
| ) | |
| # Custom Audio Panel UI (visible by default, takes full width) | |
| with gr.Row(visible=True) as custom_audio_row: | |
| # Optional: Audio prompt library table (only shown if AUDIO_PROMPT_FOLDER is configured) | |
| if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists(): | |
| with gr.Column(scale=1, min_width=200): | |
| gr.Markdown("#### Audio Library (favorite examples from voicebank datasets)") | |
| audio_prompt_table = gr.Dataframe( | |
| value=get_audio_prompt_files(), | |
| headers=["Filename"], | |
| datatype=["str"], | |
| row_count=(10, "dynamic"), | |
| col_count=(1, "fixed"), | |
| interactive=False, | |
| label="Click to select (or upload your own audio file directly on the right)" | |
| ) | |
| with gr.Column(scale=2): | |
| custom_audio_input = gr.Audio( | |
| sources=["upload", "microphone"], | |
| type="filepath", | |
| label="Speaker Reference Audio (only first two minutes will be used; leave empty for zero speaker conditioning)", | |
| max_length=600 # Maximum duration in seconds (10 minutes) | |
| ) | |
| with gr.Row(visible=False) as voicebank_row: | |
| # Voice selection UI for all voicebank datasets | |
| # EARS UI (visible by default when voicebank_row is shown) | |
| with gr.Column(scale=2, visible=True) as ears_column: | |
| gr.Markdown("### 1. Speakers (EARS)") | |
| selected_speaker_display = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| lines=2, | |
| max_lines=2 | |
| ) | |
| speaker_search = gr.Textbox( | |
| placeholder="Search speakers (by ID, gender, age, ethnicity, language)...", | |
| label="", | |
| show_label=False, | |
| container=False | |
| ) | |
| speakers_table = gr.Dataframe( | |
| value=get_speakers_table(), | |
| headers=["ID", "G", "Age", "Ethnicity", "Native Lang"], | |
| datatype=["str", "str", "str", "str", "str"], | |
| row_count=(8, "dynamic"), | |
| col_count=(5, "fixed"), | |
| interactive=False, | |
| label="Click any cell to select", | |
| column_widths=["10%", "8%", "15%", "30%", "37%"] | |
| ) | |
| # VCTK UI (hidden by default) | |
| with gr.Column(scale=2, visible=False) as vctk_column: | |
| gr.Markdown("### 1. Speakers (VCTK)") | |
| vctk_speaker_display = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| lines=2, | |
| max_lines=2 | |
| ) | |
| vctk_speaker_search = gr.Textbox( | |
| placeholder="Search speakers (by ID, gender, age, details)...", | |
| label="", | |
| show_label=False, | |
| container=False | |
| ) | |
| vctk_speakers_table = gr.Dataframe( | |
| value=get_vctk_speakers_table(), | |
| headers=["ID", "G", "Age", "Details", "Length"], | |
| datatype=["str", "str", "str", "str", "str"], | |
| row_count=(8, "dynamic"), | |
| col_count=(5, "fixed"), | |
| interactive=False, | |
| label="Click any cell to select", | |
| column_widths=["10%", "8%", "12%", "50%", "20%"] | |
| ) | |
| # Expresso UI (hidden by default) | |
| with gr.Column(scale=2, visible=False) as expresso_column: | |
| gr.Markdown("### 1. Voices (Expresso)") | |
| expresso_speaker_display = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| lines=2, | |
| max_lines=2 | |
| ) | |
| expresso_speaker_search = gr.Textbox( | |
| placeholder="Search voices (by ID, type, speakers, style)...", | |
| label="", | |
| show_label=False, | |
| container=False | |
| ) | |
| expresso_speakers_table = gr.Dataframe( | |
| value=get_expresso_speakers_table(), | |
| headers=["ID", "Type", "Speakers", "Style", "Length"], | |
| datatype=["str", "str", "str", "str", "str"], | |
| row_count=(8, "dynamic"), | |
| col_count=(5, "fixed"), | |
| interactive=False, | |
| label="Click any cell to select", | |
| column_widths=["35%", "15%", "15%", "15%", "20%"] | |
| ) | |
| # HF-Custom UI (hidden by default) | |
| with gr.Column(scale=2, visible=False) as hf_custom_column: | |
| gr.Markdown("### 1. Voices (HF-Custom)") | |
| hf_custom_speaker_display = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| lines=2, | |
| max_lines=2 | |
| ) | |
| hf_custom_speaker_search = gr.Textbox( | |
| placeholder="Search voices (by name, dataset, description)...", | |
| label="", | |
| show_label=False, | |
| container=False | |
| ) | |
| hf_custom_speakers_table = gr.Dataframe( | |
| value=get_hf_custom_speakers_table(), | |
| headers=["Name", "Dataset", "Description", "Length"], | |
| datatype=["str", "str", "str", "str"], | |
| row_count=(8, "dynamic"), | |
| col_count=(4, "fixed"), | |
| interactive=False, | |
| label="Click any cell to select", | |
| column_widths=["15%", "15%", "50%", "20%"] | |
| ) | |
| with gr.Column(scale=1, visible=True) as voice_type_column: | |
| gr.Markdown("### 2. Voice Type") | |
| selected_voice_display = gr.Textbox( | |
| value="", | |
| label="", | |
| show_label=False, | |
| interactive=False, | |
| visible=False, | |
| lines=2, | |
| max_lines=2 | |
| ) | |
| freeform_table = gr.Dataframe( | |
| value=[], | |
| headers=["Type", "Length"], | |
| datatype=["str", "str"], | |
| row_count=(1, "fixed"), | |
| col_count=(2, "fixed"), | |
| interactive=False, | |
| label="Freeform voice", | |
| visible=True, | |
| column_widths=["60%", "40%"] | |
| ) | |
| gr.Markdown("**Emotions:**") | |
| emotions_table = gr.Dataframe( | |
| value=[], | |
| headers=["Emotion", "Length"], | |
| datatype=["str", "str"], | |
| row_count=(8, "dynamic"), | |
| col_count=(2, "fixed"), | |
| interactive=False, | |
| visible=True, | |
| column_widths=["60%", "40%"] | |
| ) | |
| with gr.Column(scale=1): | |
| gr.Markdown("### 3. Audio Preview") | |
| audio_preview = gr.Audio(label="Voice Sample", type="filepath", interactive=False) | |
| gr.HTML('<hr class="section-separator">') | |
| gr.Markdown("# Text Prompt") | |
| with gr.Accordion("Text Presets", open=True): | |
| text_presets_table = gr.Dataframe( | |
| value=load_text_presets(), | |
| headers=["Category", "Words", "Preset Text"], | |
| datatype=["str", "str", "str"], | |
| row_count=(3, "dynamic"), | |
| col_count=(3, "fixed"), | |
| interactive=False, | |
| column_widths=["12%", "6%", "82%"] | |
| ) | |
| text_prompt = gr.Textbox( | |
| label="Text Prompt", | |
| placeholder="[S1] Enter your text prompt here...", | |
| lines=4 | |
| ) | |
| gr.HTML('<hr class="section-separator">') | |
| gr.Markdown("# Generation") | |
| # Mode selector: Simple or Advanced (outside the accordion, centered and prominent) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| pass # Empty column for spacing | |
| with gr.Column(scale=2): | |
| mode_selector = gr.Radio( | |
| choices=["Simple Mode", "Advanced Mode"], | |
| value="Simple Mode", | |
| label="", | |
| info=None, | |
| elem_id="component-mode-selector" | |
| ) | |
| with gr.Column(scale=1): | |
| pass # Empty column for spacing | |
| with gr.Accordion("⚙️ Generation Parameters", open=True): | |
| with gr.Row(): | |
| presets = load_sampler_presets() | |
| preset_keys = list(presets.keys()) | |
| first_preset = preset_keys[0] if preset_keys else "Custom" | |
| preset_dropdown = gr.Dropdown( | |
| choices=["Custom"] + preset_keys, | |
| value=first_preset, # Default to first preset instead of Custom | |
| label="Sampler Preset", | |
| info="Load preset configurations", | |
| scale=2 | |
| ) | |
| rng_seed = gr.Number( | |
| label="RNG Seed", | |
| value=0, | |
| info="Random seed for starting noise", | |
| precision=0, | |
| scale=1 | |
| ) | |
| # Simple mode: Speaker KV checkbox on same row (visible by default) | |
| with gr.Column(scale=1, visible=True) as simple_mode_row: | |
| speaker_kv_simple_checkbox = gr.Checkbox( | |
| label="\"Force Speaker\" (Enable Speaker KV Attention Scaling)", | |
| value=False, | |
| info="Enable if generation does not match reference voice (otherwise leave off)" | |
| ) | |
| # Advanced mode: Compile and custom shapes checkboxes (hidden by default) | |
| with gr.Column(scale=1, visible=False) as advanced_mode_compile_column: | |
| compile_checkbox = gr.Checkbox( | |
| label="Compile Model", | |
| value=True, # Default to True in simple mode | |
| interactive=not IS_ZEROGPU, | |
| info="Compile disabled on Zero GPU" if IS_ZEROGPU else "~20-30% faster after initial compilation" | |
| ) | |
| compile_status = gr.Markdown( | |
| value="⚠️ Compile disabled on Zero GPU" if IS_ZEROGPU else "", | |
| visible=IS_ZEROGPU | |
| ) | |
| use_custom_shapes_checkbox = gr.Checkbox( | |
| label="Use Custom Shapes (Advanced)", | |
| value=False, | |
| info="Override default sequence lengths for text, speaker, and sample" | |
| ) | |
| # Advanced mode controls (hidden by default) | |
| with gr.Column(visible=False) as advanced_mode_column: | |
| with gr.Row(visible=False) as custom_shapes_row: | |
| max_text_byte_length = gr.Textbox( | |
| label="Max Text Byte Length (padded)", | |
| value="768", | |
| info="Maximum text utf-8 byte sequence length (blank -> no padding)", | |
| scale=1 | |
| ) | |
| max_speaker_latent_length = gr.Textbox( | |
| label="Max Speaker Latent Length (padded)", | |
| value="2560", | |
| info="Maximum (unpatched)speaker latent length (blank -> no padding), default 2560 = ~30s", | |
| scale=1 | |
| ) | |
| sample_latent_len = gr.Textbox( | |
| label="Sample Latent Length", | |
| value="640", | |
| info="Maximum sample latent length (EXPERIMENTAL!!! ONLY TRAINED WITH 640 BUT SOMEHOW WORKS WITH < 640 TO GENERATE PREFIXES)", | |
| scale=1 | |
| ) | |
| with gr.Row(): | |
| # Left column: Core Sampling Parameters | |
| with gr.Column(scale=1): | |
| with gr.Group(): | |
| gr.HTML(""" | |
| <div class="preset-inline"> | |
| <span class="title">Core Sampling Parameters</span><span class="dim">(</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="core_default">default</a> | |
| <span class="dim">)</span> | |
| </div> | |
| """) | |
| core_preset_default = gr.Button("", elem_id="core_default", elem_classes=["proxy-btn"]) | |
| num_steps = gr.Number(label="Number of Steps", value=40, info="Number of sampling steps (consider 20 - 80) (capped at 80)", precision=0, minimum=1, step=5, maximum=80) | |
| cfg_mode = gr.Radio( | |
| choices=[ | |
| "independent", | |
| "apg-independent", | |
| "alternating", | |
| "joint-unconditional" | |
| ], | |
| value="independent", | |
| label="CFG Mode", | |
| info="Independent (3 NFE), Adaptive Projected Guidance (3 NFE, see https://arxiv.org/abs/2410.02416), Alternating (2 NFE), Joint-Unconditional (2 NFE)" | |
| ) | |
| with gr.Group(): | |
| gr.HTML(""" | |
| <div class="preset-inline"> | |
| <span class="title">CFG Guidance</span><span class="dim">(</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="cfg_default">default</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="cfg_higher">higher speaker</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="cfg_large">large guidances(works with apg)</a> | |
| <span class="dim">)</span> | |
| </div> | |
| """) | |
| cfg_preset_default = gr.Button("", elem_id="cfg_default", elem_classes=["proxy-btn"]) | |
| cfg_preset_higher_speaker = gr.Button("", elem_id="cfg_higher", elem_classes=["proxy-btn"]) | |
| cfg_preset_large_guidances = gr.Button("", elem_id="cfg_large", elem_classes=["proxy-btn"]) | |
| with gr.Row(): | |
| cfg_scale_text = gr.Number(label="Text CFG Scale", value=3.0, info="Guidance strength for text", minimum=0, step=0.5) | |
| cfg_scale_speaker = gr.Number(label="Speaker CFG Scale", value=5.0, info="Guidance strength for speaker", minimum=0, step=0.5) | |
| with gr.Row(): | |
| cfg_min_t = gr.Number(label="CFG Min t", value=0.5, info="(0-1), CFG applied when t >= val", minimum=0, maximum=1, step=0.05) | |
| cfg_max_t = gr.Number(label="CFG Max t", value=1.0, info="(0-1), CFG applied when t <= val", minimum=0, maximum=1, step=0.05) | |
| # Right column: Speaker KV, Truncation + APG | |
| with gr.Column(scale=1): | |
| with gr.Group(): | |
| gr.HTML(""" | |
| <div class="preset-inline"> | |
| <span class="title">Speaker KV Attention Scaling</span><span class="dim">(</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_enable">enable if generation does not match reference</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="spk_kv_off">off</a> | |
| <span class="dim">)</span> | |
| </div> | |
| """) | |
| spk_kv_preset_enable = gr.Button("", elem_id="spk_kv_enable", elem_classes=["proxy-btn"]) | |
| spk_kv_preset_off = gr.Button("", elem_id="spk_kv_off", elem_classes=["proxy-btn"]) | |
| speaker_k_enable = gr.Checkbox(label="Enable Speaker KV Scaling", value=False, info="Scale speaker attention key-values; useful when the model-generated audio does not at all match the reference audio (i.e. ignores speaker-reference)") | |
| with gr.Row(visible=False) as speaker_k_row: | |
| speaker_k_scale = gr.Number(label="KV Scale", value=1.5, info="Scale factor", minimum=0, step=0.1) | |
| speaker_k_min_t = gr.Number(label="KV Min t", value=0.9, info="(0-1), scale applied from steps t=1. to val", minimum=0, maximum=1, step=0.05) | |
| speaker_k_max_layers = gr.Number(label="Max Layers", value=24, info="(0-24), scale applied in first N layers", precision=0, minimum=0, maximum=24) | |
| with gr.Group(): | |
| gr.HTML(""" | |
| <div class="preset-inline"> | |
| <span class="title">Truncation & Temporal Rescaling</span><span class="dim">(</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="trunc_flat">flat</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="trunc_sharp">sharp</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="trunc_baseline">baseline(sharp)</a> | |
| <span class="dim">)</span> | |
| </div> | |
| """) | |
| trunc_preset_flat = gr.Button("", elem_id="trunc_flat", elem_classes=["proxy-btn"]) | |
| trunc_preset_sharp = gr.Button("", elem_id="trunc_sharp", elem_classes=["proxy-btn"]) | |
| trunc_preset_baseline = gr.Button("", elem_id="trunc_baseline", elem_classes=["proxy-btn"]) | |
| with gr.Row(): | |
| truncation_factor = gr.Number(label="Truncation Factor", value=0.8, info="Multiply initial noise (<1 helps artifacts)", minimum=0, step=0.05) | |
| rescale_k = gr.Number(label="Rescale k", value=1.2, info="<1=sharpen, >1=flatten, 1=off", minimum=0, step=0.05) | |
| rescale_sigma = gr.Number(label="Rescale σ", value=3.0, info="Sigma parameter", minimum=0, step=0.1) | |
| with gr.Group(visible=False) as apg_row: | |
| gr.HTML(""" | |
| <div class="preset-inline"> | |
| <span class="title">APG Parameters</span><span class="dim">(</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="apg_default">default</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_momentum">no momentum</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="apg_norms">norms</a> | |
| <span class="dim">,</span> | |
| <a href="javascript:void(0)" class="preset-link" data-fire="apg_no_eta">no eta</a> | |
| <span class="dim">)</span> | |
| </div> | |
| """) | |
| apg_preset_default = gr.Button("", elem_id="apg_default", elem_classes=["proxy-btn"]) | |
| apg_preset_no_momentum = gr.Button("", elem_id="apg_no_momentum", elem_classes=["proxy-btn"]) | |
| apg_preset_norms = gr.Button("", elem_id="apg_norms", elem_classes=["proxy-btn"]) | |
| apg_preset_no_eta = gr.Button("", elem_id="apg_no_eta", elem_classes=["proxy-btn"]) | |
| with gr.Row(): | |
| apg_eta_text = gr.Number(label="APG η (text)", value=0.5, info="Eta for text projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25) | |
| apg_eta_speaker = gr.Number(label="APG η (speaker)", value=0.5, info="Eta for speaker projection (0-1, higher -> more like CFG)", minimum=0, maximum=1, step=0.25) | |
| with gr.Row() as apg_row2: | |
| apg_momentum_text = gr.Number(label="APG Momentum (text)", value=-0.25, info="Text momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25) | |
| apg_momentum_speaker = gr.Number(label="APG Momentum (speaker)", value=-0.25, info="Speaker momentum (can try 0., -.25, -0.5, -0.75...)", step=0.25) | |
| with gr.Row(): | |
| apg_norm_text = gr.Textbox(label="APG Norm (text)", value="", info="Text norm clip (leave blank to disable, can try 7.5, 15.0)") | |
| apg_norm_speaker = gr.Textbox(label="APG Norm (speaker)", value="", info="Speaker norm clip (leave blank to disable, can try 7.5, 15.0)") | |
| # End of advanced_mode_column | |
| with gr.Row(equal_height=True): | |
| audio_format = gr.Radio( | |
| choices=["wav", "mp3"], | |
| value="wav", | |
| label="Format", | |
| scale=1, | |
| min_width=90 | |
| ) | |
| generate_btn = gr.Button("Generate Audio", variant="primary", size="lg", scale=10) | |
| with gr.Column(scale=1): | |
| show_original_audio = gr.Checkbox( | |
| label="Re-display original audio (full 2-minute cropped mono)", | |
| value=False | |
| ) | |
| reconstruct_first_30_seconds = gr.Checkbox( | |
| label="Show Autoencoder Reconstruction (only first 30s of reference)", | |
| value=False | |
| ) | |
| gr.HTML('<hr class="section-separator">') | |
| with gr.Accordion("Generated Audio", open=True, visible=True) as generated_section: | |
| generation_time_display = gr.Markdown("", visible=False) | |
| with gr.Group(elem_classes=["generated-audio-player"]): | |
| generated_audio = gr.Audio(label="Generated Audio", visible=True) | |
| text_prompt_display = gr.Markdown("", visible=False) | |
| gr.Markdown("---") | |
| reference_audio_header = gr.Markdown("#### Reference Audio", visible=False) | |
| with gr.Accordion("Original Audio (2 min Cropped Mono)", open=False, visible=False) as original_accordion: | |
| original_audio = gr.Audio(label="Original Reference Audio (2 min)", visible=True) | |
| with gr.Accordion("Autoencoder Reconstruction of First 30s of Reference", open=False, visible=False) as reference_accordion: | |
| reference_audio = gr.Audio(label="Decoded Reference Audio (30s)", visible=True) | |
| # Event handlers | |
| # Custom Audio Panel - handle audio change to update speaker_audio_path_state | |
custom_audio_input.change(
    # Mirror the current audio file path into the hidden path textbox; an
    # empty/cleared audio input is stored as an empty string.
    lambda audio_path: gr.update(value=audio_path or ""),
    inputs=[custom_audio_input],
    outputs=[speaker_audio_path_state]
)
| # Audio prompt library table selection (only if configured) | |
| if AUDIO_PROMPT_FOLDER is not None and AUDIO_PROMPT_FOLDER.exists(): | |
| audio_prompt_table.select( | |
| select_audio_prompt_file, | |
| outputs=[custom_audio_input] | |
| ) | |
| # Dataset selector: switch between Custom Audio Panel, EARS, VCTK, Expresso, and HF-Custom | |
| dataset_selector.change( | |
| switch_dataset, | |
| inputs=[dataset_selector], | |
| outputs=[ | |
| dataset_license_info, custom_audio_row, voicebank_row, voice_type_column, | |
| ears_column, vctk_column, expresso_column, hf_custom_column, | |
| selected_speaker_display, freeform_table, emotions_table, | |
| selected_voice_display, vctk_speaker_display, expresso_speaker_display, hf_custom_speaker_display, selected_speaker_state, | |
| audio_preview, speaker_st_path_state, speaker_audio_path_state | |
| ] | |
| ) | |
| # EARS: Speaker search | |
| speaker_search.change( | |
| search_speakers, | |
| inputs=[speaker_search], | |
| outputs=[speakers_table] | |
| ) | |
| # EARS: Speaker selection - populate freeform and emotions tables | |
| speakers_table.select( | |
| select_speaker_from_table, | |
| inputs=[speakers_table], | |
| outputs=[selected_speaker_display, freeform_table, emotions_table, selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state, selected_voice_display] | |
| ) | |
# VCTK, Expresso and HF-Custom share the same wiring: the search box filters
# that dataset's table, and selecting a table cell loads the voice directly.
# Handlers are registered in the same order as before
# (search then select, for VCTK, Expresso, HF-Custom).
_voice_load_outputs = [selected_speaker_state, audio_preview, speaker_st_path_state, speaker_audio_path_state]
for _search_box, _search_fn, _speakers_table, _select_fn, _speaker_display in (
    (
        vctk_speaker_search,
        search_vctk_speakers,
        vctk_speakers_table,
        select_vctk_speaker_from_table,
        vctk_speaker_display,
    ),
    (
        expresso_speaker_search,
        search_expresso_speakers,
        expresso_speakers_table,
        select_expresso_speaker_from_table,
        expresso_speaker_display,
    ),
    (
        hf_custom_speaker_search,
        search_hf_custom_speakers,
        hf_custom_speakers_table,
        select_hf_custom_speaker_from_table,
        hf_custom_speaker_display,
    ),
):
    # Speaker search: the search text is the only input, the table the only output.
    _search_box.change(
        _search_fn,
        inputs=[_search_box],
        outputs=[_speakers_table]
    )
    # Speaker selection: updates the dataset-specific display box plus the
    # shared selection state, audio preview and path states.
    _speakers_table.select(
        _select_fn,
        inputs=[_speakers_table],
        outputs=[_speaker_display, *_voice_load_outputs]
    )
| # Freeform selection: load freeform voice files | |
| freeform_table.select( | |
| select_freeform_from_table, | |
| inputs=[selected_speaker_state], | |
| outputs=[selected_voice_display, audio_preview, speaker_st_path_state, speaker_audio_path_state] | |
| ) | |
| # Emotion selection: load voice files | |
| emotions_table.select( | |
| select_emotion_from_table, | |
| inputs=[selected_speaker_state], | |
| outputs=[selected_voice_display, audio_preview, speaker_st_path_state, speaker_audio_path_state] | |
| ) | |
| text_presets_table.select(select_text_preset, outputs=text_prompt) | |
| # Mode selector handler | |
mode_selector.change(
    toggle_mode,
    inputs=[mode_selector, speaker_k_enable, speaker_kv_simple_checkbox],
    outputs=[simple_mode_row, advanced_mode_compile_column, advanced_mode_column, speaker_kv_simple_checkbox, speaker_k_enable]
).then(
    # After the mode switch, show the Speaker KV row only while scaling is
    # enabled and put the three KV fields back to their default values.
    # The values never depended on `enabled`: the previous conditional
    # expressions had identical branches, so plain constants are used.
    lambda enabled: (
        gr.update(visible=enabled),
        gr.update(value=1.5),
        gr.update(value=0.9),
        gr.update(value=24),
    ),
    inputs=[speaker_k_enable],
    outputs=[speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers]
)
| # Simple mode speaker KV checkbox handler | |
| speaker_kv_simple_checkbox.change( | |
| sync_simple_to_advanced, | |
| inputs=[speaker_kv_simple_checkbox], | |
| outputs=[speaker_k_enable, speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers] | |
| ) | |
| cfg_mode.change(update_cfg_visibility, inputs=cfg_mode, outputs=[cfg_scale_text, cfg_scale_speaker, apg_row]) | |
| # Speaker K enable handler - toggle row visibility and sync with simple mode | |
| speaker_k_enable.change( | |
| lambda enabled: (gr.update(visible=enabled), gr.update(value=enabled)), | |
| inputs=[speaker_k_enable], | |
| outputs=[speaker_k_row, speaker_kv_simple_checkbox] | |
| ) | |
| # Custom shapes enable handler - toggle row visibility and reset to defaults on disable | |
def toggle_custom_shapes(enabled):
    """Build the updates for the "Use Custom Shapes" checkbox.

    Args:
        enabled: Current value of the checkbox.

    Returns:
        A 4-tuple of updates, in order: the custom-shapes row visibility,
        then the max text byte length, max speaker latent length and sample
        latent length fields. Enabling only reveals the row and leaves the
        three fields untouched; disabling hides the row and resets the
        fields to "768", "2560" and "640".
    """
    if not enabled:
        default_lengths = ("768", "2560", "640")
        field_resets = tuple(gr.update(value=length) for length in default_lengths)
        return (gr.update(visible=False),) + field_resets

    untouched_fields = tuple(gr.update() for _ in range(3))
    return (gr.update(visible=True),) + untouched_fields
| use_custom_shapes_checkbox.change( | |
| toggle_custom_shapes, | |
| inputs=[use_custom_shapes_checkbox], | |
| outputs=[custom_shapes_row, max_text_byte_length, max_speaker_latent_length, sample_latent_len] | |
| ) | |
# Preset proxy buttons: each button applies one named preset to its parameter
# group. Every group also lists `preset_dropdown` as its last output.
# Registration order is unchanged: core, CFG, speaker KV, truncation, APG.

# Core preset handler
_core_preset_outputs = [rng_seed, num_steps, cfg_mode, preset_dropdown]
core_preset_default.click(lambda: apply_core_preset("default"), outputs=_core_preset_outputs)

# CFG preset handlers
_cfg_preset_outputs = [cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t, preset_dropdown]
cfg_preset_default.click(lambda: apply_cfg_preset("default"), outputs=_cfg_preset_outputs)
cfg_preset_higher_speaker.click(lambda: apply_cfg_preset("higher speaker"), outputs=_cfg_preset_outputs)
cfg_preset_large_guidances.click(lambda: apply_cfg_preset("large guidances"), outputs=_cfg_preset_outputs)

# Speaker KV preset handlers
_speaker_kv_preset_outputs = [speaker_k_enable, speaker_k_row, preset_dropdown]
spk_kv_preset_enable.click(lambda: apply_speaker_kv_preset("enable"), outputs=_speaker_kv_preset_outputs)
spk_kv_preset_off.click(lambda: apply_speaker_kv_preset("off"), outputs=_speaker_kv_preset_outputs)

# Truncation preset handlers
_truncation_preset_outputs = [truncation_factor, rescale_k, rescale_sigma, preset_dropdown]
trunc_preset_flat.click(lambda: apply_truncation_preset("flat"), outputs=_truncation_preset_outputs)
trunc_preset_sharp.click(lambda: apply_truncation_preset("sharp"), outputs=_truncation_preset_outputs)
trunc_preset_baseline.click(lambda: apply_truncation_preset("baseline(sharp)"), outputs=_truncation_preset_outputs)

# APG preset handlers
_apg_preset_outputs = [
    apg_eta_text,
    apg_eta_speaker,
    apg_momentum_text,
    apg_momentum_speaker,
    apg_norm_text,
    apg_norm_speaker,
    preset_dropdown,
]
apg_preset_default.click(lambda: apply_apg_preset("default"), outputs=_apg_preset_outputs)
apg_preset_no_momentum.click(lambda: apply_apg_preset("no momentum"), outputs=_apg_preset_outputs)
apg_preset_norms.click(lambda: apply_apg_preset("norms"), outputs=_apg_preset_outputs)
apg_preset_no_eta.click(lambda: apply_apg_preset("no eta"), outputs=_apg_preset_outputs)
| # Preset handler | |
| preset_dropdown.change( | |
| apply_sampler_preset, | |
| inputs=preset_dropdown, | |
| outputs=[num_steps, cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t, truncation_factor, rescale_k, rescale_sigma, speaker_k_enable, speaker_k_row, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers, apg_eta_text, apg_eta_speaker, apg_momentum_text, apg_momentum_speaker, apg_norm_text, apg_norm_speaker] | |
| ) | |
# Compile handler
# Step 1: toggling the checkbox calls compile_model with the checkbox value.
_compile_toggle_event = compile_checkbox.change(
    fn=compile_model,
    inputs=compile_checkbox,
    outputs=[compile_checkbox, compile_status],
)
# Step 2: chained after step 1; do_compile takes no inputs. Note that its
# outputs are ordered (status, checkbox), the reverse of step 1.
_compile_toggle_event.then(
    fn=do_compile,
    outputs=[compile_status, compile_checkbox],
)
# Generate handler
# Positional inputs handed to generate_audio; the order must stay in sync with
# that function's parameter list.
_generate_inputs = [
    # Prompt text and speaker reference state
    text_prompt, speaker_st_path_state, speaker_audio_path_state,
    # Step count and seed
    num_steps, rng_seed,
    # CFG settings
    cfg_mode, cfg_scale_text, cfg_scale_speaker, cfg_min_t, cfg_max_t,
    # Truncation / rescale settings
    truncation_factor, rescale_k, rescale_sigma,
    # Speaker KV settings
    speaker_k_enable, speaker_k_scale, speaker_k_min_t, speaker_k_max_layers,
    # APG settings
    apg_eta_text, apg_eta_speaker,
    apg_momentum_text, apg_momentum_speaker,
    apg_norm_text, apg_norm_speaker,
    # Reconstruction and custom-shape options
    reconstruct_first_30_seconds, use_custom_shapes_checkbox,
    max_text_byte_length, max_speaker_latent_length, sample_latent_len,
    # Output format
    audio_format,
    compile_checkbox,  # Pass compile state to choose model
    show_original_audio,
    session_id_state,
]
# Components that receive generate_audio's return values, in order.
_generate_outputs = [
    generated_section,
    generated_audio,
    text_prompt_display,
    original_audio,
    generation_time_display,
    reference_audio,
    original_accordion,
    reference_accordion,
    reference_audio_header,
]
generate_btn.click(
    fn=generate_audio,
    inputs=_generate_inputs,
    outputs=_generate_outputs,
)
# Initialize session ID and trigger compilation when the page loads
_page_load_event = demo.load(
    fn=init_and_compile,
    outputs=[session_id_state, compile_status, compile_checkbox],
)
# Apply the first preset on load (chained after init_and_compile).
_page_load_event.then(
    lambda: apply_sampler_preset(list(load_sampler_presets().keys())[0]),
    outputs=[
        # Core sampling / CFG controls
        num_steps,
        cfg_mode,
        cfg_scale_text,
        cfg_scale_speaker,
        cfg_min_t,
        cfg_max_t,
        # Truncation / rescale controls
        truncation_factor,
        rescale_k,
        rescale_sigma,
        # Speaker KV controls
        speaker_k_enable,
        speaker_k_row,
        speaker_k_scale,
        speaker_k_min_t,
        speaker_k_max_layers,
        # APG controls
        apg_eta_text,
        apg_eta_speaker,
        apg_momentum_text,
        apg_momentum_speaker,
        apg_norm_text,
        apg_norm_speaker,
    ],
)
if __name__ == "__main__":
    # For HF-Custom, allow the entire dataset cache directory to handle subdirectories
    # (taken as the directory three levels above HF_CUSTOM_PATH).
    hf_custom_cache = HF_CUSTOM_PATH.parent.parent.parent

    # Directories passed to launch() as allowed_paths, in the original order.
    servable_dirs = (
        EARS_PATH,
        VCTK_PATH,
        EXPRESSO_PATH,
        hf_custom_cache,
        TEMP_AUDIO_DIR,
        AUDIO_PROMPT_FOLDER,
    )
    allowed_paths = [str(directory) for directory in servable_dirs]

    # Enable queue for better handling of concurrent requests on HF Spaces
    demo.queue(max_size=20)
    demo.launch(allowed_paths=allowed_paths)