# DMOSpeech2/f5_tts/model/dataset.py
import json
import random
import re
from importlib.resources import files
import torch
import torch.nn.functional as F
import torchaudio
from datasets import Dataset as Dataset_
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, Sampler
from tqdm import tqdm
from audiomentations import (  # provides the optional data-augmentation transforms used below
    AddBackgroundNoise,
    AddGaussianNoise,
    Aliasing,
    ApplyImpulseResponse,
    BandPassFilter,
    Compose,
    PitchShift,
    PolarityInversion,
    SevenBandParametricEQ,
    TanhDistortion,
)

from f5_tts.model.modules import MelSpec
from f5_tts.model.utils import default
def get_speaker_id(path):
    """Extract the speaker id from an audio path, assuming a
    .../speaker_id/chapter_id/filename layout (as in LibriTTS)."""
    parts = path.split("/")
    speaker_id = parts[-3]
    return speaker_id
class CustomDataset(Dataset):
def __init__(
self,
custom_dataset: Dataset,
durations=None,
target_sample_rate=24_000,
hop_length=256,
n_mel_channels=100,
n_fft=1024,
win_length=1024,
mel_spec_type="vocos",
preprocessed_mel=False,
mel_spec_module: nn.Module | None = None,
validation=False,
validation_num=5000,
data_augmentation=False,
return_wavform=False,
remove_starting_space=True,
need_prompt_speech=False,
        prompt_repository: dict | None = None,
):
self.data = custom_dataset
self.durations = durations
self.target_sample_rate = target_sample_rate
self.hop_length = hop_length
self.n_fft = n_fft
self.win_length = win_length
self.mel_spec_type = mel_spec_type
self.preprocessed_mel = preprocessed_mel
if not preprocessed_mel:
self.mel_spectrogram = default(
mel_spec_module,
MelSpec(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
n_mel_channels=n_mel_channels,
target_sample_rate=target_sample_rate,
mel_spec_type=mel_spec_type,
),
)
self.validation = validation
self.validation_num = validation_num
if (not validation) and data_augmentation:
print("Using data augmentation.")
self.augment = Compose(
[
                    AddBackgroundNoise(
                        sounds_path="/data5/ESC-50-master",  # machine-specific noise corpus path
min_snr_db=3.0,
max_snr_db=30.0,
noise_transform=PolarityInversion(),
p=0.5,
),
AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
PitchShift(min_semitones=-12.0, max_semitones=12.0, p=0.8),
                    ApplyImpulseResponse(ir_path="/data5/Audio", p=1.0),  # machine-specific IR path
Aliasing(min_sample_rate=4000, max_sample_rate=30000, p=0.3),
BandPassFilter(min_center_freq=100.0, max_center_freq=6000, p=0.2),
SevenBandParametricEQ(p=0.2),
TanhDistortion(min_distortion=0.01, max_distortion=0.7, p=0.2),
]
)
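            # NOTE: audiomentations transforms operate on 1-D float32 numpy arrays,
            # which is why __getitem__ converts tensors to numpy before augmenting
            # and back afterwards.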
else:
print("No data augmentation.")
self.augment = None
self.return_wavform = return_wavform
self.remove_starting_space = remove_starting_space
if need_prompt_speech:
            if prompt_repository is None:
self.prompt_repository = {}
                for row in tqdm(self.data, desc="Grouping samples by speaker"):
                    audio_path = row["audio_path"]
                    spk_id = get_speaker_id(audio_path)
                    assert spk_id is not None and spk_id != "mp3"
if spk_id not in self.prompt_repository:
self.prompt_repository[spk_id] = [row]
else:
self.prompt_repository[spk_id].append(row)
else:
self.prompt_repository = prompt_repository
print(
f"Grouped samples into {len(self.prompt_repository.keys())} speakers."
)
self.need_prompt_speech = True
else:
self.need_prompt_speech = False
    def get_frame_len(self, index):
        if self.validation:
            index += len(self.data) - self.validation_num
        if self.durations is not None:
            # make sure separately provided durations are correct, otherwise OOM is almost guaranteed
            return self.durations[index] * self.target_sample_rate / self.hop_length
        return self.data[index]["duration"] * self.target_sample_rate / self.hop_length
def __len__(self):
if not self.validation:
return len(self.data) - self.validation_num
return self.validation_num
def __getitem__(self, index, return_row=True, return_path=False):
if self.validation:
index += len(self.data) - self.validation_num
out = {}
while True:
row = self.data[index]
audio_path = row["audio_path"]
text = row["text"]
duration = row["duration"]
if not isinstance(text, list):
text = list(text)
# filter by given length
if (0.3 <= duration <= 30) and (0 < len(text) < 2048):
break # valid
index = (index + 1) % len(self.data)
if self.remove_starting_space:
while len(text) > 1 and text[0] == " ":
text = text[1:]
if self.preprocessed_mel:
mel_spec = torch.tensor(row["mel_spec"])
else:
audio, source_sample_rate = torchaudio.load(audio_path)
# make sure mono input
if audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True)
# resample if necessary
if source_sample_rate != self.target_sample_rate:
resampler = torchaudio.transforms.Resample(
source_sample_rate, self.target_sample_rate
)
audio = resampler(audio)
            if not self.validation and self.augment is not None:
                audio = self.augment(
                    audio.squeeze().numpy(), sample_rate=self.target_sample_rate
                )
                audio = torch.from_numpy(audio).float().unsqueeze(0)
# to mel spectrogram
mel_spec = self.mel_spectrogram(audio)
mel_spec = mel_spec.squeeze(0) # '1 d t -> d t'
out["mel_spec"] = mel_spec
out["text"] = text
out["duration"] = duration
out["target_text"] = self.data[(index + len(self.data) // 2) % len(self.data)][
"text"
]
        if self.return_wavform:
            assert not self.preprocessed_mel, "raw waveform is unavailable with preprocessed mel"
            out["wav"] = audio
if return_path:
out["path"] = audio_path
if return_row:
out["row"] = row
        # sample a prompt speech of the same speaker from prompt_repository
if self.need_prompt_speech:
spk = get_speaker_id(audio_path)
spk_repository = self.prompt_repository[spk]
            _count = 100  # retry budget; after it is exhausted, accept the last draw
            while True:
                pmt_row = random.choice(spk_repository)
                pmt_audio_path = pmt_row["audio_path"]
                pmt_text = pmt_row["text"]
                pmt_duration = pmt_row["duration"]
                if not isinstance(pmt_text, list):
                    pmt_text = list(pmt_text)
                # filter by duration/text length and avoid reusing the current utterance's text
                if (0.3 <= pmt_duration <= 30) and (0 < len(pmt_text) < 2048):
                    if pmt_text != text:
                        break
                _count = _count - 1
                if _count <= 0:
                    break
if self.remove_starting_space:
while len(pmt_text) > 1 and pmt_text[0] == " ":
pmt_text = pmt_text[1:]
if self.preprocessed_mel:
pmt_mel_spec = torch.tensor(pmt_row["mel_spec"])
else:
pmt_audio, source_sample_rate = torchaudio.load(pmt_audio_path)
# make sure mono input
if pmt_audio.shape[0] > 1:
pmt_audio = torch.mean(pmt_audio, dim=0, keepdim=True)
# resample if necessary
if source_sample_rate != self.target_sample_rate:
resampler = torchaudio.transforms.Resample(
source_sample_rate, self.target_sample_rate
)
pmt_audio = resampler(pmt_audio)
                if not self.validation and self.augment is not None:
                    pmt_audio = self.augment(
                        pmt_audio.squeeze().numpy(),
                        sample_rate=self.target_sample_rate,
                    )
                    pmt_audio = torch.from_numpy(pmt_audio).float().unsqueeze(0)
# to mel spectrogram
pmt_mel_spec = self.mel_spectrogram(pmt_audio)
pmt_mel_spec = pmt_mel_spec.squeeze(0) # '1 d t -> d t'
out["pmt_mel_spec"] = pmt_mel_spec
out["pmt_text"] = pmt_text
out["pmt_duration"] = pmt_duration
            if self.return_wavform:
                assert not self.preprocessed_mel, "raw waveform is unavailable with preprocessed mel"
                out["pmt_wav"] = pmt_audio
if return_path:
out["pmt_path"] = pmt_audio_path
if return_row:
out["pmt_row"] = pmt_row
return out
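
# Item sketch (hedged): each CustomDataset[i] returns a dict with at least
# "mel_spec" (n_mel_channels, T), "text" (list of characters), "duration" (seconds),
# and "target_text", plus "wav"/"path"/"row" and the "pmt_*" counterparts when the
# corresponding flags are enabled.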
# Dynamic Batch Sampler
class DynamicBatchSampler(Sampler[list[int]]):
"""Extension of Sampler that will do the following:
1. Change the batch size (essentially number of sequences)
in a batch to ensure that the total number of frames are less
than a certain threshold.
2. Make sure the padding efficiency in the batch is high.
"""
def __init__(
self,
sampler: Sampler[int],
frames_threshold: int,
max_samples=0,
random_seed=None,
drop_last: bool = False,
):
self.sampler = sampler
self.frames_threshold = frames_threshold
self.max_samples = max_samples
indices, batches = [], []
data_source = self.sampler.data_source
# for idx in tqdm(
# self.sampler, desc="Sorting with sampler... if slow, check whether dataset is provided with duration"
# ):
for idx in self.sampler:
indices.append((idx, data_source.get_frame_len(idx)))
indices.sort(key=lambda elem: elem[1])
batch = []
batch_frames = 0
# for idx, frame_len in tqdm(
# indices, desc=f"Creating dynamic batches with {frames_threshold} audio frames per gpu"
# ):
for idx, frame_len in indices:
if batch_frames + frame_len <= self.frames_threshold and (
max_samples == 0 or len(batch) < max_samples
):
batch.append(idx)
batch_frames += frame_len
            else:
                if len(batch) > 0:
                    batches.append(batch)
                if frame_len <= self.frames_threshold:
                    batch = [idx]
                    batch_frames = frame_len
                else:
                    # a single sample longer than the threshold is dropped entirely
                    batch = []
                    batch_frames = 0
if not drop_last and len(batch) > 0:
batches.append(batch)
del indices
        # To get different batches between epochs, set a seed and log it in the checkpoint,
        # e.g. use (random_seed + n) for epoch n. During multi-GPU training, even though the
        # batches on each GPU do not change between epochs, the effective global minibatch does.
random.seed(random_seed)
random.shuffle(batches)
self.batches = batches
def __iter__(self):
return iter(self.batches)
def __len__(self):
return len(self.batches)
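
# Usage sketch (hedged): wiring DynamicBatchSampler into a DataLoader. It replaces
# batch_size/shuffle on the DataLoader; the dataset only needs a get_frame_len(idx)
# method. frames_threshold=10_000 and max_samples=64 below are illustrative values,
# not settings prescribed by this repo.
#
#     from torch.utils.data import DataLoader, SequentialSampler
#
#     batch_sampler = DynamicBatchSampler(
#         SequentialSampler(train_dataset),
#         frames_threshold=10_000,  # max total mel frames per batch (per GPU)
#         max_samples=64,           # optional cap on sequences per batch
#         random_seed=666,          # fixes the batch shuffle for reproducibility
#     )
#     loader = DataLoader(train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn)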
# Load dataset
def load_dataset(
dataset_name: str,
tokenizer: str = "pinyin",
dataset_type: str = "CustomDataset",
audio_type: str = "raw",
mel_spec_module: nn.Module | None = None,
mel_spec_kwargs: dict = dict(),
split: str = "train",
data_augmentation: bool = False,
return_wavform: bool = False,
remove_starting_space: bool = True,
need_prompt_speech: bool = False,
    prompt_repository: dict | None = None,
) -> CustomDataset:
"""
dataset_type - "CustomDataset" if you want to use tokenizer name and default data path to load for train_dataset
- "CustomDatasetPath" if you just want to pass the full path to a preprocessed dataset without relying on tokenizer
"""
print("Loading dataset ...")
if dataset_type == "CustomDataset":
        # NOTE: hardcoded, machine-specific data root
        rel_data_path = f"/home/yl4579/F5-TTS-diff/F5-TTS-DMD-flow-ds/data/{dataset_name}_{tokenizer}"
        if "LibriTTS_100_360_500_char_pinyin" in rel_data_path:
            rel_data_path = rel_data_path.replace(
                "LibriTTS_100_360_500_char_pinyin", "LibriTTS_100_360_500_char"
            )
if audio_type == "raw":
try:
train_dataset = load_from_disk(f"{rel_data_path}/raw")
            except Exception:  # fall back to reading the .arrow file directly
train_dataset = Dataset_.from_file(f"{rel_data_path}/raw.arrow")
preprocessed_mel = False
elif audio_type == "mel":
train_dataset = Dataset_.from_file(f"{rel_data_path}/mel.arrow")
preprocessed_mel = True
with open(f"{rel_data_path}/duration.json", "r", encoding="utf-8") as f:
data_dict = json.load(f)
durations = data_dict["duration"]
train_dataset = CustomDataset(
train_dataset,
durations=durations,
preprocessed_mel=preprocessed_mel,
mel_spec_module=mel_spec_module,
**mel_spec_kwargs,
validation=split == "val",
data_augmentation=data_augmentation,
return_wavform=return_wavform,
remove_starting_space=remove_starting_space,
need_prompt_speech=need_prompt_speech,
prompt_repository=prompt_repository,
)
elif dataset_type == "CustomDatasetPath":
try:
train_dataset = load_from_disk(f"{dataset_name}/raw")
except: # noqa: E722
train_dataset = Dataset_.from_file(f"{dataset_name}/raw.arrow")
with open(f"{dataset_name}/duration.json", "r", encoding="utf-8") as f:
data_dict = json.load(f)
durations = data_dict["duration"]
train_dataset = CustomDataset(
train_dataset,
durations=durations,
preprocessed_mel=preprocessed_mel,
**mel_spec_kwargs,
)
return train_dataset
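
# Usage sketch (hedged): a typical training-data call. The dataset name and mel
# settings below are illustrative assumptions; the mel kwargs are forwarded to
# CustomDataset (and from there to MelSpec).
#
#     train_dataset = load_dataset(
#         "Emilia_ZH_EN",
#         tokenizer="pinyin",
#         mel_spec_kwargs=dict(
#             target_sample_rate=24_000,
#             hop_length=256,
#             n_mel_channels=100,
#             n_fft=1024,
#             win_length=1024,
#             mel_spec_type="vocos",
#         ),
#     )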
# collation
def collate_fn(batch):
# Extract mel_specs and their lengths
mel_specs = [item["mel_spec"].squeeze(0) for item in batch]
mel_lengths = torch.LongTensor([spec.shape[-1] for spec in mel_specs])
max_mel_length = mel_lengths.amax()
# Pad mel_specs
padded_mel_specs = []
    for spec in mel_specs:  # TODO: maybe record a mask for attention here
padding = (0, max_mel_length - spec.size(-1))
padded_spec = F.pad(spec, padding, value=0)
padded_mel_specs.append(padded_spec)
mel_specs = torch.stack(padded_mel_specs)
text = [item["text"] for item in batch]
target_text = [item["target_text"] for item in batch]
text_lengths = torch.LongTensor([len(item) for item in text])
out = dict(
mel=mel_specs,
mel_lengths=mel_lengths,
text=text,
text_lengths=text_lengths,
target_text=target_text,
)
if "pmt_mel_spec" in batch[0]:
pmt_mel_specs = [item["pmt_mel_spec"].squeeze(0) for item in batch]
pmt_mel_lengths = torch.LongTensor([spec.shape[-1] for spec in pmt_mel_specs])
max_pmt_mel_length = pmt_mel_lengths.amax()
        # Pad pmt_mel_specs
padded_pmt_mel_specs = []
for spec in pmt_mel_specs:
padding = (0, max_pmt_mel_length - spec.size(-1))
padded_spec = F.pad(spec, padding, value=0)
padded_pmt_mel_specs.append(padded_spec)
pmt_mel_specs = torch.stack(padded_pmt_mel_specs)
out["pmt_mel_specs"] = pmt_mel_specs
if "pmt_text" in batch[0]:
pmt_text = [item["pmt_text"] for item in batch]
pmt_text_lengths = torch.LongTensor([len(item) for item in pmt_text])
out["pmt_text"] = pmt_text
out["pmt_text_lengths"] = pmt_text_lengths
return out
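
# Output sketch (hedged): collate_fn is meant to be passed to a DataLoader together
# with DynamicBatchSampler (see the example above). For a batch of B items with
# n_mel_channels D, the returned dict contains:
#
#     out["mel"]          float tensor (B, D, T_max), zero-padded along time
#     out["mel_lengths"]  LongTensor (B,) with the unpadded frame counts
#     out["text"]         list of B character lists (variable length)
#     out["text_lengths"] LongTensor (B,)
#     out["target_text"]  list of B character lists
#     plus "pmt_mel_specs"/"pmt_text"/"pmt_text_lengths" when prompts are present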