import os
import torch
import torchaudio
import numpy as np
from pathlib import Path
from typing import Optional, Union, List, Tuple, Dict
from cached_path import cached_path
from hydra.utils import get_class
from omegaconf import OmegaConf
from importlib.resources import files
from pydub import AudioSegment, silence

from f5_tts.model import CFM
from f5_tts.model.utils import (
    get_tokenizer,
    convert_char_to_pinyin,
)
from f5_tts.infer.utils_infer import (
    chunk_text,
    load_vocoder,
    transcribe,
    initialize_asr_pipeline,
)


class F5TTSWrapper:
    """
    A wrapper class for F5-TTS that preprocesses reference audio once
    and allows for repeated TTS generation.
    """
    def __init__(
        self,
        model_name: str = "F5TTS_v1_Base",
        ckpt_path: Optional[str] = None,
        vocab_file: Optional[str] = None,
        vocoder_name: str = "vocos",
        use_local_vocoder: bool = False,
        vocoder_path: Optional[str] = None,
        device: Optional[str] = None,
        hf_cache_dir: Optional[str] = None,
        target_sample_rate: int = 24000,
        n_mel_channels: int = 100,
        hop_length: int = 256,
        win_length: int = 1024,
        n_fft: int = 1024,
        ode_method: str = "euler",
        use_ema: bool = True,
    ):
""" | |
Initialize the F5-TTS wrapper with model configuration. | |
Args: | |
model_name: Name of the F5-TTS model variant (e.g., "F5TTS_v1_Base") | |
ckpt_path: Path to the model checkpoint file. If None, will use default path. | |
vocab_file: Path to the vocab file. If None, will use default. | |
vocoder_name: Name of the vocoder to use ("vocos" or "bigvgan") | |
use_local_vocoder: Whether to use a local vocoder or download from HF | |
vocoder_path: Path to the local vocoder. Only used if use_local_vocoder is True. | |
device: Device to run the model on. If None, will automatically determine. | |
hf_cache_dir: Directory to cache HuggingFace models | |
target_sample_rate: Target sample rate for audio | |
n_mel_channels: Number of mel channels | |
hop_length: Hop length for the mel spectrogram | |
win_length: Window length for the mel spectrogram | |
n_fft: FFT size for the mel spectrogram | |
ode_method: ODE method for sampling ("euler" or "midpoint") | |
use_ema: Whether to use EMA weights from the checkpoint | |
""" | |
        # Set device
        if device is None:
            self.device = (
                "cuda" if torch.cuda.is_available()
                else "xpu" if torch.xpu.is_available()
                else "mps" if torch.backends.mps.is_available()
                else "cpu"
            )
        else:
            self.device = device

        # Audio processing parameters
        self.target_sample_rate = target_sample_rate
        self.n_mel_channels = n_mel_channels
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        self.mel_spec_type = vocoder_name

        # Sampling parameters
        self.ode_method = ode_method

        # Initialize ASR for transcription if needed
        initialize_asr_pipeline(device=self.device)
        # Resolve the checkpoint path (download from the HuggingFace Hub if not provided)
        if ckpt_path is None:
            repo_name = "F5-TTS"
            ckpt_step = 1250000
            ckpt_type = "safetensors"
            ckpt_name = model_name
            # Adjust for previous models; keep model_name untouched so the
            # config file lookup below still resolves
            if model_name == "F5TTS_Base":
                if vocoder_name == "vocos":
                    ckpt_step = 1200000
                elif vocoder_name == "bigvgan":
                    ckpt_name = "F5TTS_Base_bigvgan"
                    ckpt_type = "pt"
            elif model_name == "E2TTS_Base":
                repo_name = "E2-TTS"
                ckpt_step = 1200000
            ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{ckpt_name}/model_{ckpt_step}.{ckpt_type}"))

        # Load model configuration
        config_path = str(files("f5_tts").joinpath(f"configs/{model_name}.yaml"))
        model_cfg = OmegaConf.load(config_path)
        model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
        model_arc = model_cfg.model.arch
        # Load tokenizer
        if vocab_file is None:
            vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
        tokenizer_type = "custom"
        self.vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer_type)
        # Create model
        self.model = CFM(
            transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
            mel_spec_kwargs=dict(
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                n_mel_channels=n_mel_channels,
                target_sample_rate=target_sample_rate,
                mel_spec_type=vocoder_name,
            ),
            odeint_kwargs=dict(
                method=ode_method,
            ),
            vocab_char_map=self.vocab_char_map,
        ).to(self.device)

        # Load checkpoint
        dtype = torch.float32 if vocoder_name == "bigvgan" else None
        self._load_checkpoint(self.model, ckpt_path, dtype=dtype, use_ema=use_ema)
        # Load vocoder
        if vocoder_path is None:
            if vocoder_name == "vocos":
                vocoder_path = "../checkpoints/vocos-mel-24khz"
            elif vocoder_name == "bigvgan":
                vocoder_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
        self.vocoder = load_vocoder(
            vocoder_name=vocoder_name,
            is_local=use_local_vocoder,
            local_path=vocoder_path,
            device=self.device,
            hf_cache_dir=hf_cache_dir,
        )

        # Storage for reference data
        self.ref_audio_processed = None
        self.ref_text = None
        self.ref_audio_len = None

        # Default inference parameters
        self.target_rms = 0.1
        self.cross_fade_duration = 0.15
        self.nfe_step = 32
        self.cfg_strength = 2.0
        self.sway_sampling_coef = -1.0
        self.speed = 1.0
        self.fix_duration = None
    def _load_checkpoint(self, model, ckpt_path, dtype=None, use_ema=True):
        """
        Load model checkpoint with proper handling of different checkpoint formats.

        Args:
            model: The model to load weights into
            ckpt_path: Path to the checkpoint file
            dtype: Data type for model weights
            use_ema: Whether to use EMA weights from the checkpoint

        Returns:
            Loaded model
        """
        if dtype is None:
            dtype = (
                torch.float16
                if "cuda" in self.device
                and torch.cuda.get_device_properties(self.device).major >= 7
                and not torch.cuda.get_device_name().endswith("[ZLUDA]")
                else torch.float32
            )
        model = model.to(dtype)

        ckpt_type = ckpt_path.split(".")[-1]
        if ckpt_type == "safetensors":
            from safetensors.torch import load_file

            checkpoint = load_file(ckpt_path, device=self.device)
        else:
            checkpoint = torch.load(ckpt_path, map_location=self.device, weights_only=True)

        if use_ema:
            if ckpt_type == "safetensors":
                checkpoint = {"ema_model_state_dict": checkpoint}
            checkpoint["model_state_dict"] = {
                k.replace("ema_model.", ""): v
                for k, v in checkpoint["ema_model_state_dict"].items()
                if k not in ["initted", "step"]
            }
            # Patch for backward compatibility
            for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
                if key in checkpoint["model_state_dict"]:
                    del checkpoint["model_state_dict"][key]
            model.load_state_dict(checkpoint["model_state_dict"])
        else:
            if ckpt_type == "safetensors":
                checkpoint = {"model_state_dict": checkpoint}
            model.load_state_dict(checkpoint["model_state_dict"])

        del checkpoint
        torch.cuda.empty_cache()
        return model.to(self.device)
    def preprocess_reference(self, ref_audio_path: str, ref_text: str = "", clip_short: bool = True):
        """
        Preprocess the reference audio and text, storing them for later use.

        Args:
            ref_audio_path: Path to the reference audio file
            ref_text: Text transcript of reference audio. If empty, will auto-transcribe.
            clip_short: Whether to clip long audio to shorter segments

        Returns:
            Tuple of processed audio and text
        """
        print("Converting audio...")

        # Load audio file
        aseg = AudioSegment.from_file(ref_audio_path)

        if clip_short:
            # 1. try to find long silence for clipping
            non_silent_segs = silence.split_on_silence(
                aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
            )
            non_silent_wave = AudioSegment.silent(duration=0)
            for non_silent_seg in non_silent_segs:
                if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
                    print("Audio is over 12s, clipping short. (1)")
                    break
                non_silent_wave += non_silent_seg

            # 2. try to find short silence for clipping if 1. failed
            if len(non_silent_wave) > 12000:
                non_silent_segs = silence.split_on_silence(
                    aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
                )
                non_silent_wave = AudioSegment.silent(duration=0)
                for non_silent_seg in non_silent_segs:
                    if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
                        print("Audio is over 12s, clipping short. (2)")
                        break
                    non_silent_wave += non_silent_seg

            aseg = non_silent_wave

            # 3. if no proper silence found for clipping
            if len(aseg) > 12000:
                aseg = aseg[:12000]
                print("Audio is over 12s, clipping short. (3)")

        # Remove silence edges
        aseg = self._remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
        # Export to temporary file and load as tensor
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            aseg.export(tmp_file.name, format="wav")
            processed_audio_path = tmp_file.name

        # Transcribe if needed
        if not ref_text.strip():
            print("No reference text provided, transcribing reference audio...")
            ref_text = transcribe(processed_audio_path)
        else:
            print("Using custom reference text...")

        # Ensure ref_text ends with proper punctuation
        if not ref_text.endswith(". ") and not ref_text.endswith("。"):
            if ref_text.endswith("."):
                ref_text += " "
            else:
                ref_text += ". "

        print("\nReference text:", ref_text)
        # Load and process audio
        audio, sr = torchaudio.load(processed_audio_path)
        if audio.shape[0] > 1:  # Convert stereo to mono
            audio = torch.mean(audio, dim=0, keepdim=True)

        # Normalize volume
        rms = torch.sqrt(torch.mean(torch.square(audio)))
        if rms < self.target_rms:
            audio = audio * self.target_rms / rms

        # Resample if needed
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            audio = resampler(audio)

        # Move to device
        audio = audio.to(self.device)

        # Store reference data
        self.ref_audio_processed = audio
        self.ref_text = ref_text
        self.ref_audio_len = audio.shape[-1] // self.hop_length

        # Remove temporary file
        os.unlink(processed_audio_path)

        return audio, ref_text
    def _remove_silence_edges(self, audio, silence_threshold=-42):
        """
        Remove silence from the start and end of audio.

        Args:
            audio: AudioSegment to process
            silence_threshold: dB threshold to consider as silence

        Returns:
            Processed AudioSegment
        """
        # Remove silence from the start
        non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
        audio = audio[non_silent_start_idx:]

        # Remove silence from the end
        non_silent_end_duration = audio.duration_seconds
        for ms in reversed(audio):
            if ms.dBFS > silence_threshold:
                break
            non_silent_end_duration -= 0.001
        trimmed_audio = audio[: int(non_silent_end_duration * 1000)]

        return trimmed_audio
    def generate(
        self,
        text: str,
        output_path: Optional[str] = None,
        nfe_step: Optional[int] = None,
        cfg_strength: Optional[float] = None,
        sway_sampling_coef: Optional[float] = None,
        speed: Optional[float] = None,
        fix_duration: Optional[float] = None,
        cross_fade_duration: Optional[float] = None,
        return_numpy: bool = False,
        return_spectrogram: bool = False,
    ) -> Union[str, Tuple[np.ndarray, int], Tuple[np.ndarray, int, np.ndarray]]:
""" | |
Generate speech for the given text using the stored reference audio. | |
Args: | |
text: Text to synthesize | |
output_path: Path to save the generated audio. If None, won't save. | |
nfe_step: Number of function evaluation steps | |
cfg_strength: Classifier-free guidance strength | |
sway_sampling_coef: Sway sampling coefficient | |
speed: Speed of generated audio | |
fix_duration: Fixed duration in seconds | |
cross_fade_duration: Duration of cross-fade between segments | |
return_numpy: If True, returns the audio as a numpy array | |
return_spectrogram: If True, also returns the spectrogram | |
Returns: | |
If output_path provided: path to output file | |
If return_numpy=True: tuple of (audio_array, sample_rate) | |
If return_spectrogram=True: tuple of (audio_array, sample_rate, spectrogram) | |
""" | |
if self.ref_audio_processed is None or self.ref_text is None: | |
raise ValueError("Reference audio not preprocessed. Call preprocess_reference() first.") | |
        # Use default values if not specified
        nfe_step = nfe_step if nfe_step is not None else self.nfe_step
        cfg_strength = cfg_strength if cfg_strength is not None else self.cfg_strength
        sway_sampling_coef = sway_sampling_coef if sway_sampling_coef is not None else self.sway_sampling_coef
        speed = speed if speed is not None else self.speed
        fix_duration = fix_duration if fix_duration is not None else self.fix_duration
        cross_fade_duration = cross_fade_duration if cross_fade_duration is not None else self.cross_fade_duration
        # Split the input text into batches
        audio_len = self.ref_audio_processed.shape[-1] / self.target_sample_rate
        max_chars = int(len(self.ref_text.encode("utf-8")) / audio_len * (22 - audio_len))
        text_batches = chunk_text(text, max_chars=max_chars)
        for i, text_batch in enumerate(text_batches):
            print(f"Text batch {i}: {text_batch}")
        print("\n")

        # Generate audio for each batch
        generated_waves = []
        spectrograms = []

        for text_batch in text_batches:
            # Adjust speed for very short texts
            local_speed = speed
            if len(text_batch.encode("utf-8")) < 10:
                local_speed = 0.3

            # Prepare the text
            text_list = [self.ref_text + text_batch]
            final_text_list = convert_char_to_pinyin(text_list)
            # Calculate duration
            if fix_duration is not None:
                duration = int(fix_duration * self.target_sample_rate / self.hop_length)
            else:
                # Calculate duration based on text length
                ref_text_len = len(self.ref_text.encode("utf-8"))
                gen_text_len = len(text_batch.encode("utf-8"))
                duration = self.ref_audio_len + int(self.ref_audio_len / ref_text_len * gen_text_len / local_speed)

            # Generate audio
            with torch.inference_mode():
                generated, _ = self.model.sample(
                    cond=self.ref_audio_processed,
                    text=final_text_list,
                    duration=duration,
                    steps=nfe_step,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                )

            # Process the generated mel spectrogram
            generated = generated.to(torch.float32)
            generated = generated[:, self.ref_audio_len:, :]
            generated = generated.permute(0, 2, 1)
            # Convert to audio
            if self.mel_spec_type == "vocos":
                generated_wave = self.vocoder.decode(generated)
            elif self.mel_spec_type == "bigvgan":
                generated_wave = self.vocoder(generated)
            else:
                raise ValueError(f"Unsupported vocoder type: {self.mel_spec_type}")

            # Normalize volume if needed
            rms = torch.sqrt(torch.mean(torch.square(self.ref_audio_processed)))
            if rms < self.target_rms:
                generated_wave = generated_wave * rms / self.target_rms

            # Convert to numpy and append to list
            generated_wave = generated_wave.squeeze().cpu().numpy()
            generated_waves.append(generated_wave)

            # Store spectrogram if needed
            if return_spectrogram or output_path is not None:
                spectrograms.append(generated.squeeze().cpu().numpy())
        # Combine all segments
        if generated_waves:
            if cross_fade_duration <= 0:
                # Simply concatenate
                final_wave = np.concatenate(generated_waves)
            else:
                # Cross-fade between segments
                final_wave = generated_waves[0]
                for i in range(1, len(generated_waves)):
                    prev_wave = final_wave
                    next_wave = generated_waves[i]

                    # Calculate cross-fade samples
                    cross_fade_samples = int(cross_fade_duration * self.target_sample_rate)
                    cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))

                    if cross_fade_samples <= 0:
                        # No overlap possible, concatenate
                        final_wave = np.concatenate([prev_wave, next_wave])
                        continue

                    # Create cross-fade
                    prev_overlap = prev_wave[-cross_fade_samples:]
                    next_overlap = next_wave[:cross_fade_samples]
                    fade_out = np.linspace(1, 0, cross_fade_samples)
                    fade_in = np.linspace(0, 1, cross_fade_samples)
                    cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
                    final_wave = np.concatenate([
                        prev_wave[:-cross_fade_samples],
                        cross_faded_overlap,
                        next_wave[cross_fade_samples:],
                    ])

            # Combine spectrograms if needed
            if return_spectrogram or output_path is not None:
                combined_spectrogram = np.concatenate(spectrograms, axis=1)
            # Save to file if path provided
            if output_path is not None:
                output_dir = os.path.dirname(output_path)
                if output_dir and not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                # Save audio
                torchaudio.save(
                    output_path,
                    torch.tensor(final_wave).unsqueeze(0),
                    self.target_sample_rate,
                )

                # Save spectrogram if needed
                if return_spectrogram:
                    spectrogram_path = os.path.splitext(output_path)[0] + "_spec.png"
                    self._save_spectrogram(combined_spectrogram, spectrogram_path)

                if not return_numpy:
                    return output_path

            # Return as requested
            if return_spectrogram:
                return final_wave, self.target_sample_rate, combined_spectrogram
            else:
                return final_wave, self.target_sample_rate
        else:
            raise RuntimeError("No audio generated")
    def _save_spectrogram(self, spectrogram, path):
        """Save spectrogram as image"""
        import matplotlib.pyplot as plt

        plt.figure(figsize=(12, 4))
        plt.imshow(spectrogram, origin="lower", aspect="auto")
        plt.colorbar()
        plt.savefig(path)
        plt.close()
    def get_current_audio_length(self):
        """Get the length of the reference audio in seconds"""
        if self.ref_audio_processed is None:
            return 0
        return self.ref_audio_processed.shape[-1] / self.target_sample_rate
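

if __name__ == "__main__":
    # Minimal usage sketch, not part of the wrapper itself. It assumes the
    # f5_tts package and its default checkpoints are available for download,
    # and the "ref.wav" / "out.wav" paths below are placeholders to replace
    # with your own files.
    tts = F5TTSWrapper(model_name="F5TTS_v1_Base", vocoder_name="vocos")

    # Preprocess the reference once; with ref_text="" the audio is auto-transcribed.
    tts.preprocess_reference("ref.wav", ref_text="", clip_short=True)

    # Reuse the cached reference for repeated generations.
    out_path = tts.generate(
        "Hello from the F5-TTS wrapper.",
        output_path="out.wav",
        nfe_step=32,
        cfg_strength=2.0,
    )
    print("Saved:", out_path)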