Spaces:

mrfakename
/

DMOSpeech2

Running on Zero

App Files Files Community

DMOSpeech2 / unimodel.py

mrfakename

pt 1

597cecf 6 days ago

raw

history blame contribute delete

12.5 kB

	from __future__ import annotations

	import contextlib
	import copy
	import os
	from pathlib import Path
	from random import random
	from typing import Callable

	import torch
	from torch import nn

	from f5_tts.model import DiT, UNetT
	from f5_tts.model.utils import (default, exists, lens_to_mask, list_str_to_idx,
	list_str_to_tensor, mask_from_frac_lengths,
	sample_consecutive_steps, sample_from_list)
	from guidance_model import Guidance


	class UniModel(nn.Module):
	    """Bundle a trainable student DiT with a ``Guidance`` module.

	    The teacher ``model`` (optionally restored from a checkpoint directory) is
	    deep-copied three times: once as ``real_unet`` and once as ``fake_unet``,
	    both handed to ``Guidance``, and once as ``feedforward_model``, the student
	    whose parameters are marked trainable.
	    """

	    def __init__(
	        self,
	        model: DiT,  # teacher model (dit model)
	        checkpoint_path: str = "",
	        second_time: bool = True,
	        use_fp16: bool = True,
	        real_guidance_scale: float = 2.0,
	        fake_guidance_scale: float = 0.0,
	        gen_cls_loss: bool = False,
	        sway_coeff: float = -1.0,
	        vocab_char_map: dict[str, int] | None = None,
	        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
	    ):
	        """Build the student and guidance networks from the teacher.

	        Parameters:
	        -----------
	        model: DiT
	            Teacher network. It is updated in place when a checkpoint is loaded
	            and is then deep-copied; the copies are what this module keeps.
	        checkpoint_path: str
	            Directory containing ``.pt`` checkpoints. An empty string skips
	            loading and sets ``self.scale`` to 1.0.
	        second_time: bool
	            When True, ``forward`` passes the current time tensor to the
	            guidance model under the ``"second_time"`` key, otherwise ``None``.
	        use_fp16, real_guidance_scale, fake_guidance_scale, gen_cls_loss, sway_coeff
	            Forwarded unchanged to ``Guidance``.
	        vocab_char_map: dict[str, int] | None
	            If given, ``list[str]`` text is converted with ``list_str_to_idx``;
	            otherwise ``list_str_to_tensor`` is used.
	        frac_lengths_mask: tuple[float, float]
	            Bounds passed to ``uniform_`` when sampling the masked fraction.

	        Raises:
	        -------
	        FileNotFoundError
	            If ``checkpoint_path`` is given but holds no ``.pt`` file.
	        """
	        super().__init__()

	        if checkpoint_path != "":
	            # List the directory once and pick the file to restore.
	            latest_checkpoint = self._latest_checkpoint_name(
	                os.listdir(checkpoint_path), checkpoint_path
	            )
	            checkpoint = torch.load(
	                os.path.join(checkpoint_path, latest_checkpoint),
	                weights_only=True,
	                map_location="cpu",
	            )

	            self.scale = checkpoint["scale"] if "scale" in checkpoint else 1.0
	            print(f"Loaded teacher model with scale: {self.scale}")

	            if "step" in checkpoint:
	                state = checkpoint["model_state_dict"]
	            else:
	                # Checkpoints without a "step" entry only carry EMA weights:
	                # drop the EMA bookkeeping entries and the "ema_model." tag.
	                state = {
	                    k.replace("ema_model.", ""): v
	                    for k, v in checkpoint["ema_model_state_dict"].items()
	                    if k not in ["initted", "step"]
	                }

	            # only load the DiT module; strip the leading prefix only, so a
	            # later "transformer." inside a key is left intact
	            prefix = "transformer."
	            filtered_state_dict = {
	                k[len(prefix):]: v for k, v in state.items() if k.startswith(prefix)
	            }

	            model.load_state_dict(filtered_state_dict, strict=False)
	        else:
	            self.scale = 1.0

	        real_unet = copy.deepcopy(model)
	        real_unet.time_embed2 = None

	        fake_unet = copy.deepcopy(model)

	        # Instantiate Guidance, which internally uses real_unet and fake_unet
	        # initialized from the teacher
	        self.guidance_model = Guidance(
	            real_unet=real_unet,
	            fake_unet=fake_unet,
	            use_fp16=use_fp16,
	            real_guidance_scale=real_guidance_scale,
	            fake_guidance_scale=fake_guidance_scale,
	            gen_cls_loss=gen_cls_loss,
	            sway_coeff=sway_coeff,
	        )

	        self.feedforward_model = copy.deepcopy(model)  # initialize the student model
	        self.feedforward_model.requires_grad_(True)
	        self.feedforward_model.time_embed2 = None

	        self.vocab_char_map = vocab_char_map
	        self.frac_lengths_mask = frac_lengths_mask

	        self.second_time = second_time  # fake_unet.time_embed2 is not None

	    @staticmethod
	    def _latest_checkpoint_name(filenames, directory=""):
	        """Return the checkpoint file name to load from a directory listing.

	        ``model_last.pt`` wins when present. Otherwise the ``.pt`` file whose
	        digits, concatenated, form the largest integer is chosen; names
	        without any digit sort first instead of raising ``ValueError``.
	        ``directory`` is only used in the error message.
	        """
	        if "model_last.pt" in filenames:
	            return "model_last.pt"

	        def step_of(name):
	            digits = "".join(filter(str.isdigit, name))
	            return int(digits) if digits else -1

	        candidates = sorted(
	            (f for f in filenames if f.endswith(".pt")), key=step_of
	        )
	        if not candidates:
	            raise FileNotFoundError(
	                f"no '.pt' checkpoint file found in {directory!r}"
	            )
	        return candidates[-1]

	    @staticmethod
	    def _is_final_student_step(time, student_steps):
	        """Return a 0-d bool tensor: does ``time`` sit on the last student step?

	        Works for schedules given as a plain list of numbers as well as a
	        tensor; the last entry is converted to ``time``'s dtype and device
	        before comparing.
	        """
	        final_step = torch.as_tensor(
	            student_steps[-1], dtype=time.dtype, device=time.device
	        )
	        return time.mean() == final_step.mean()

	    def forward(
	        self,
	        inp: float["b n d"],  # mel
	        text: int["b nt"] | list[str],
	        *,
	        lens: int["b"] | None = None,
	        student_steps: list[float] | None = None,
	        update_generator: bool = False,
	    ):
	        """
	        Run the student twice and route the result to the Guidance module.

	        The student first denoises a noised copy of ``inp`` at the previous
	        step without gradients; that output is re-noised to the current step
	        and denoised again. Gradients are tracked in the second pass only when
	        ``update_generator`` is True. Positions outside the random span mask
	        are always overwritten with ``inp``.

	        Parameters:
	        -----------
	        inp: Tensor
	            Input mel; the first two dimensions are read as batch and length.
	        text: Tensor or list[str]
	            Text input. A list of strings is converted to an index tensor.
	        lens: Tensor or None
	            Per-sample lengths. Defaults to the full sequence length.
	        student_steps: sequence of float or None
	            Student time schedule. ``None`` selects ``[0, 0.25, 0.5, 0.75]``.
	        update_generator: bool
	            If True, run the generator turn of the guidance model; otherwise
	            run the guidance turn on the detached student output.

	        Returns:
	        --------
	        loss_dict: dict
	            Losses returned by the guidance model for the selected turn.
	        log_dict: dict
	            Logging values returned by the guidance model. On the generator
	            turn the keys "ground_truth", "generator_input",
	            "generator_output", "generator_cond" and "time" are added.
	        """
	        # A fresh list per call avoids sharing one mutable default object.
	        if student_steps is None:
	            student_steps = [0, 0.25, 0.5, 0.75]

	        batch, seq_len = inp.shape[:2]
	        device = inp.device

	        # handle text as string
	        if isinstance(text, list):
	            if exists(self.vocab_char_map):
	                text = list_str_to_idx(text, self.vocab_char_map).to(device)
	            else:
	                text = list_str_to_tensor(text).to(device)
	            assert text.shape[0] == batch

	        # lens and mask
	        if not exists(lens):
	            lens = torch.full((batch,), seq_len, device=device)

	        mask = lens_to_mask(
	            lens, length=seq_len
	        )  # useless here, as collate_fn will pad to max length in batch

	        # sample from the list of student steps; the per-sample draw is only
	        # used as a template, every sample then shares the same pair of steps
	        # (presumably the current step and the one preceding it)
	        time = sample_from_list(student_steps, batch).to(device)
	        c_time, p_time = sample_consecutive_steps(student_steps)
	        time = torch.ones_like(time) * c_time
	        p_time = torch.ones_like(time) * p_time

	        frac_lengths = (
	            torch.zeros((batch,), device=device)
	            .float()
	            .uniform_(*self.frac_lengths_mask)
	        )
	        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

	        if exists(mask):
	            rand_span_mask &= mask

	        # use generated output from previous step as input
	        with torch.no_grad():
	            x1 = inp
	            x0 = torch.randn_like(x1)
	            t = p_time.unsqueeze(-1).unsqueeze(-1)
	            phi = (1 - t) * x0 + t * x1
	            cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

	            pred = self.feedforward_model(
	                x=phi,
	                cond=cond,
	                text=text,
	                time=p_time,
	                drop_audio_cond=False,
	                drop_text=False,  # make sure the cfg=1
	            )  # flow prediction

	            # predicted mel spectrogram
	            output = phi + (1 - t) * pred
	            output[~rand_span_mask] = inp[~rand_span_mask]

	            # forward diffusion
	            x1 = output
	            x0 = torch.randn_like(x1)
	            t = time.unsqueeze(-1).unsqueeze(-1)
	            phi = (1 - t) * x0 + t * x1
	            cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

	        # Track gradients through the student only when it is being updated.
	        with torch.no_grad() if not update_generator else contextlib.nullcontext():
	            pred = self.feedforward_model(
	                x=phi,
	                cond=cond,
	                text=text,
	                time=time,
	                drop_audio_cond=False,
	                drop_text=False,  # make sure no cfg is used
	            )

	            # predicted mel spectrogram
	            output = phi + (1 - t) * pred
	            output[~rand_span_mask] = inp[~rand_span_mask]

	        if update_generator:
	            generator_data_dict = {
	                "inp": output,
	                "text": text,
	                "rand_span_mask": rand_span_mask,
	                "second_time": time if self.second_time else None,
	                "mse_loss": self._is_final_student_step(time, student_steps),
	                "real_data": {
	                    "inp": inp,
	                    "text": text,
	                    "rand_span_mask": rand_span_mask,
	                },
	            }

	            # avoid any side effects of gradient accumulation
	            # self.guidance_model.requires_grad_(False)
	            # self.feedforward_model.requires_grad_(True)
	            generator_loss_dict, generator_log_dict = self.guidance_model(
	                generator_turn=True,
	                guidance_turn=False,
	                generator_data_dict=generator_data_dict,
	                guidance_data_dict=None,
	            )

	            # NOTE(review): x1 here is the first-pass student output that was
	            # re-noised above, not ``inp`` -- confirm this is what should be
	            # logged as "ground_truth".
	            generator_log_dict["ground_truth"] = x1
	            generator_log_dict["generator_input"] = phi
	            generator_log_dict["generator_output"] = output
	            generator_log_dict["generator_cond"] = cond
	            generator_log_dict["time"] = time

	            return generator_loss_dict, generator_log_dict
	        else:
	            guidance_data_dict = {
	                "inp": output.detach(),
	                "text": text,
	                "rand_span_mask": rand_span_mask,
	                "second_time": time if self.second_time else None,
	                "real_data": {
	                    "inp": inp,
	                    "text": text,
	                    "rand_span_mask": rand_span_mask,
	                },
	            }

	            # avoid any side effects of gradient accumulation
	            # self.feedforward_model.requires_grad_(False)
	            # self.guidance_model.requires_grad_(True)
	            guidance_loss_dict, guidance_log_dict = self.guidance_model(
	                generator_turn=False,
	                guidance_turn=True,
	                generator_data_dict=None,
	                guidance_data_dict=guidance_data_dict,
	            )
	            # self.feedforward_model.requires_grad_(True)

	            return guidance_loss_dict, guidance_log_dict

	# return guidance_loss_dict, guidance_log_dict, generator_loss_dict, generator_log_dict


	if __name__ == "__main__":
	    # Manual smoke test: build a UniModel from a local checkpoint, run one
	    # guidance turn and one generator turn on a saved batch, then
	    # back-propagate both summed losses. Needs CUDA, the checkpoint directory
	    # below and a "batch.pt" file in the working directory.

	    from torch.utils.data import DataLoader, Dataset, SequentialSampler

	    from f5_tts.model.dataset import (DynamicBatchSampler, collate_fn,
	                                      load_dataset)
	    from f5_tts.model.utils import get_tokenizer

	    bsz = 16

	    tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
	    # if tokenizer = 'custom', set this to the tokenizer file to use
	    # (should be vocab.txt)
	    tokenizer_path = None
	    dataset_name = "Emilia_ZH_EN"
	    # Non-custom tokenizers are looked up by dataset name.
	    if tokenizer != "custom":
	        tokenizer_path = dataset_name
	    vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)

	    dit = DiT(
	        dim=1024,
	        depth=22,
	        heads=16,
	        ff_mult=2,
	        text_dim=512,
	        conv_layers=4,
	        text_num_embeds=vocab_size,
	        mel_dim=100,
	    )

	    model = UniModel(
	        dit,
	        checkpoint_path="/data4/F5TTS/ckpts/F5TTS_Base_norm_flow_8GPU_vocos_pinyin_Emilia_ZH_EN",
	        gen_cls_loss=True,
	        vocab_char_map=vocab_char_map,
	        frac_lengths_mask=(0.7, 1.0),
	    ).cuda()

	    # batch = next(iter(train_dataloader))
	    # torch.save(batch, "batch.pt")
	    batch = torch.load("batch.pt")
	    inp = batch["mel"].permute(0, 2, 1).cuda()
	    text = batch["text"]
	    lens = batch["mel_lengths"].cuda()

	    # text = ["hello world"] * bsz
	    # lens = torch.randint(1, 1000, (bsz,)).cuda()
	    # inp = torch.randn(bsz, lens.max(), 100).cuda()
	    num_student_step = 4

	    def _student_schedule():
	        # Evenly spaced steps in [0, 1) -- the end point 1.0 is dropped.
	        return torch.linspace(0.0, 1.0, num_student_step + 1)[:-1]

	    with torch.autocast(device_type="cuda", dtype=torch.float16):
	        guidance_loss_dict, guidance_log_dict = model(
	            inp,
	            text,
	            lens=lens,
	            update_generator=False,
	            student_steps=_student_schedule(),
	        )

	        generator_loss_dict, generator_log_dict = model(
	            inp,
	            text,
	            lens=lens,
	            update_generator=True,
	            student_steps=_student_schedule(),
	        )

	    print(guidance_loss_dict)
	    print(generator_loss_dict)

	    # sum() starts from 0 and adds the terms in the listed order.
	    guidance_loss = sum(
	        guidance_loss_dict[key] for key in ("loss_fake_mean", "guidance_cls_loss")
	    )
	    generator_loss = sum(
	        generator_loss_dict[key]
	        for key in ("loss_dm", "loss_ctc", "loss_sim", "gen_cls_loss", "loss_mse")
	    )

	    guidance_loss.backward()
	    generator_loss.backward()