from ..models import ModelManager, SD3TextEncoder1, HunyuanVideoVAEDecoder, HunyuanVideoVAEEncoder
from ..models.hunyuan_video_dit import HunyuanVideoDiT
from ..models.hunyuan_video_text_encoder import HunyuanVideoLLMEncoder
from ..schedulers.flow_match import FlowMatchScheduler
from .base import BasePipeline
from ..prompters import HunyuanVideoPrompter
import torch
import torchvision.transforms as transforms
from einops import rearrange
import numpy as np
from PIL import Image
from tqdm import tqdm


class HunyuanVideoPipeline(BasePipeline):

    def __init__(self, device="cuda", torch_dtype=torch.float16):
        super().__init__(device=device, torch_dtype=torch_dtype)
        self.scheduler = FlowMatchScheduler(shift=7.0, sigma_min=0.0, extra_one_step=True)
        self.prompter = HunyuanVideoPrompter()
        self.text_encoder_1: SD3TextEncoder1 = None
        self.text_encoder_2: HunyuanVideoLLMEncoder = None
        self.dit: HunyuanVideoDiT = None
        self.vae_decoder: HunyuanVideoVAEDecoder = None
        self.vae_encoder: HunyuanVideoVAEEncoder = None
        self.model_names = ['text_encoder_1', 'text_encoder_2', 'dit', 'vae_decoder', 'vae_encoder']
        self.vram_management = False

    def enable_vram_management(self):
        self.vram_management = True
        self.enable_cpu_offload()
        self.text_encoder_2.enable_auto_offload(dtype=self.torch_dtype, device=self.device)
        self.dit.enable_auto_offload(dtype=self.torch_dtype, device=self.device)

    def fetch_models(self, model_manager: ModelManager):
        self.text_encoder_1 = model_manager.fetch_model("sd3_text_encoder_1")
        self.text_encoder_2 = model_manager.fetch_model("hunyuan_video_text_encoder_2")
        self.dit = model_manager.fetch_model("hunyuan_video_dit")
        self.vae_decoder = model_manager.fetch_model("hunyuan_video_vae_decoder")
        self.vae_encoder = model_manager.fetch_model("hunyuan_video_vae_encoder")
        self.prompter.fetch_models(self.text_encoder_1, self.text_encoder_2)

    @staticmethod
    def from_model_manager(model_manager: ModelManager, torch_dtype=None, device=None, enable_vram_management=True):
        if device is None:
            device = model_manager.device
        if torch_dtype is None:
            torch_dtype = model_manager.torch_dtype
        pipe = HunyuanVideoPipeline(device=device, torch_dtype=torch_dtype)
        pipe.fetch_models(model_manager)
        if enable_vram_management:
            pipe.enable_vram_management()
        return pipe

    def generate_crop_size_list(self, base_size=256, patch_size=32, max_ratio=4.0):
        # Enumerate candidate (width, height) crop sizes, in multiples of patch_size,
        # whose aspect ratio does not exceed max_ratio.
        num_patches = round((base_size / patch_size) ** 2)
        assert max_ratio >= 1.0
        crop_size_list = []
        wp, hp = num_patches, 1
        while wp > 0:
            if max(wp, hp) / min(wp, hp) <= max_ratio:
                crop_size_list.append((wp * patch_size, hp * patch_size))
            if (hp + 1) * wp <= num_patches:
                hp += 1
            else:
                wp -= 1
        return crop_size_list

    def get_closest_ratio(self, height: float, width: float, ratios: list, buckets: list):
        aspect_ratio = float(height) / float(width)
        closest_ratio_id = np.abs(ratios - aspect_ratio).argmin()
        closest_ratio = min(ratios, key=lambda ratio: abs(float(ratio) - aspect_ratio))
        return buckets[closest_ratio_id], float(closest_ratio)

    def prepare_vae_images_inputs(self, semantic_images, i2v_resolution="720p"):
        if i2v_resolution == "720p":
            bucket_hw_base_size = 960
        elif i2v_resolution == "540p":
            bucket_hw_base_size = 720
        elif i2v_resolution == "360p":
            bucket_hw_base_size = 480
        else:
            raise ValueError(f"i2v_resolution: {i2v_resolution} must be in [360p, 540p, 720p]")
        origin_size = semantic_images[0].size
        crop_size_list = self.generate_crop_size_list(bucket_hw_base_size, 32)
        aspect_ratios = np.array([round(float(h) / float(w), 5) for h, w in crop_size_list])
        closest_size, closest_ratio = self.get_closest_ratio(origin_size[1], origin_size[0], aspect_ratios, crop_size_list)
        ref_image_transform = transforms.Compose([
            transforms.Resize(closest_size),
            transforms.CenterCrop(closest_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])
        semantic_image_pixel_values = [ref_image_transform(semantic_image) for semantic_image in semantic_images]
        semantic_image_pixel_values = torch.cat(semantic_image_pixel_values).unsqueeze(0).unsqueeze(2).to(self.device)
        target_height, target_width = closest_size
        return semantic_image_pixel_values, target_height, target_width

    def encode_prompt(self, prompt, positive=True, clip_sequence_length=77, llm_sequence_length=256, input_images=None):
        prompt_emb, pooled_prompt_emb, text_mask = self.prompter.encode_prompt(
            prompt, device=self.device, positive=positive,
            clip_sequence_length=clip_sequence_length, llm_sequence_length=llm_sequence_length,
            images=input_images
        )
        return {"prompt_emb": prompt_emb, "pooled_prompt_emb": pooled_prompt_emb, "text_mask": text_mask}

    def prepare_extra_input(self, latents=None, guidance=1.0):
        freqs_cos, freqs_sin = self.dit.prepare_freqs(latents)
        guidance = torch.Tensor([guidance] * latents.shape[0]).to(device=latents.device, dtype=latents.dtype)
        return {"freqs_cos": freqs_cos, "freqs_sin": freqs_sin, "guidance": guidance}

    def tensor2video(self, frames):
        frames = rearrange(frames, "C T H W -> T H W C")
        frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
        frames = [Image.fromarray(frame) for frame in frames]
        return frames

    def encode_video(self, frames, tile_size=(17, 30, 30), tile_stride=(12, 20, 20)):
        # Convert tile size and stride from latent units to pixel units
        # (4x temporal and 8x spatial compression in the VAE).
        tile_size = ((tile_size[0] - 1) * 4 + 1, tile_size[1] * 8, tile_size[2] * 8)
        tile_stride = (tile_stride[0] * 4, tile_stride[1] * 8, tile_stride[2] * 8)
        latents = self.vae_encoder.encode_video(frames, tile_size=tile_size, tile_stride=tile_stride)
        return latents

    @torch.no_grad()
    def __call__(
        self,
        prompt,
        negative_prompt="",
        input_video=None,
        input_images=None,
        i2v_resolution="720p",
        i2v_stability=True,
        denoising_strength=1.0,
        seed=None,
        rand_device=None,
        height=720,
        width=1280,
        num_frames=129,
        embedded_guidance=6.0,
        cfg_scale=1.0,
        num_inference_steps=30,
        tea_cache_l1_thresh=None,
        tile_size=(17, 30, 30),
        tile_stride=(12, 20, 20),
        step_processor=None,
        progress_bar_cmd=lambda x: x,
        progress_bar_st=None,
    ):
        # Tiler parameters
        tiler_kwargs = {"tile_size": tile_size, "tile_stride": tile_stride}

        # Scheduler
        self.scheduler.set_timesteps(num_inference_steps, denoising_strength)

        # Encode input images
        if input_images is not None:
            self.load_models_to_device(['vae_encoder'])
            image_pixel_values, height, width = self.prepare_vae_images_inputs(input_images, i2v_resolution=i2v_resolution)
            with torch.autocast(device_type=self.device, dtype=torch.float16, enabled=True):
                image_latents = self.vae_encoder(image_pixel_values)

        # Initialize noise
        rand_device = self.device if rand_device is None else rand_device
        noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=self.torch_dtype).to(self.device)
        if input_video is not None:
            self.load_models_to_device(['vae_encoder'])
            input_video = self.preprocess_images(input_video)
            input_video = torch.stack(input_video, dim=2)
            latents = self.encode_video(input_video, **tiler_kwargs).to(dtype=self.torch_dtype, device=self.device)
            latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
        elif input_images is not None and i2v_stability:
            noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=image_latents.dtype).to(self.device)
            t = torch.tensor([0.999]).to(device=self.device)
            latents = noise * t + image_latents.repeat(1, 1, (num_frames - 1) // 4 + 1, 1, 1) * (1 - t)
            latents = latents.to(dtype=image_latents.dtype)
        else:
            latents = noise

        # Encode prompts
        # The current MLLM text encoder does not support VRAM management.
        self.load_models_to_device(["text_encoder_1"] if self.vram_management and input_images is None else ["text_encoder_1", "text_encoder_2"])
        prompt_emb_posi = self.encode_prompt(prompt, positive=True, input_images=input_images)
        if cfg_scale != 1.0:
            prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)

        # Extra input
        extra_input = self.prepare_extra_input(latents, guidance=embedded_guidance)

        # TeaCache
        tea_cache_kwargs = {"tea_cache": TeaCache(num_inference_steps, rel_l1_thresh=tea_cache_l1_thresh) if tea_cache_l1_thresh is not None else None}

        # Denoise
        self.load_models_to_device([] if self.vram_management else ["dit"])
        for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
            timestep = timestep.unsqueeze(0).to(self.device)
            print(f"Step {progress_id + 1} / {len(self.scheduler.timesteps)}")
            forward_func = lets_dance_hunyuan_video
            if input_images is not None:
                latents = torch.concat([image_latents, latents[:, :, 1:, :, :]], dim=2)
                forward_func = lets_dance_hunyuan_video_i2v

            # Inference
            with torch.autocast(device_type=self.device, dtype=self.torch_dtype):
                noise_pred_posi = forward_func(self.dit, latents, timestep, **prompt_emb_posi, **extra_input, **tea_cache_kwargs)
                if cfg_scale != 1.0:
                    noise_pred_nega = forward_func(self.dit, latents, timestep, **prompt_emb_nega, **extra_input)
                    noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
                else:
                    noise_pred = noise_pred_posi

            # (Experimental feature, may be removed in the future)
            if step_processor is not None:
                self.load_models_to_device(['vae_decoder'])
                rendered_frames = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents, to_final=True)
                rendered_frames = self.vae_decoder.decode_video(rendered_frames, **tiler_kwargs)
                rendered_frames = self.tensor2video(rendered_frames[0])
                rendered_frames = step_processor(rendered_frames, original_frames=input_video)
                self.load_models_to_device(['vae_encoder'])
                rendered_frames = self.preprocess_images(rendered_frames)
                rendered_frames = torch.stack(rendered_frames, dim=2)
                target_latents = self.encode_video(rendered_frames).to(dtype=self.torch_dtype, device=self.device)
                noise_pred = self.scheduler.return_to_timestep(self.scheduler.timesteps[progress_id], latents, target_latents)
                self.load_models_to_device([] if self.vram_management else ["dit"])

            # Scheduler
            if input_images is not None:
                latents = self.scheduler.step(noise_pred[:, :, 1:, :, :], self.scheduler.timesteps[progress_id], latents[:, :, 1:, :, :])
                latents = torch.concat([image_latents, latents], dim=2)
            else:
                latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)

        # Decode
        self.load_models_to_device(['vae_decoder'])
        frames = self.vae_decoder.decode_video(latents, **tiler_kwargs)
        self.load_models_to_device([])
        frames = self.tensor2video(frames[0])
        return frames


class TeaCache:
    def __init__(self, num_inference_steps, rel_l1_thresh):
        self.num_inference_steps = num_inference_steps
        self.step = 0
        self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = None
        self.rel_l1_thresh = rel_l1_thresh
        self.previous_residual = None
        self.previous_hidden_states = None

    def check(self, dit: HunyuanVideoDiT, img, vec):
        img_ = img.clone()
        vec_ = vec.clone()
        img_mod1_shift, img_mod1_scale, _, _, _, _ = dit.double_blocks[0].component_a.mod(vec_).chunk(6, dim=-1)
        normed_inp = dit.double_blocks[0].component_a.norm1(img_)
        modulated_inp = normed_inp * (1 + img_mod1_scale.unsqueeze(1)) + img_mod1_shift.unsqueeze(1)
        if self.step == 0 or self.step == self.num_inference_steps - 1:
            # Always run the full forward pass on the first and last steps.
            should_calc = True
            self.accumulated_rel_l1_distance = 0
        else:
            coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
            rescale_func = np.poly1d(coefficients)
            self.accumulated_rel_l1_distance += rescale_func(((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
            if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
                should_calc = False
            else:
                should_calc = True
                self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = modulated_inp
        self.step += 1
        if self.step == self.num_inference_steps:
            self.step = 0
        if should_calc:
            self.previous_hidden_states = img.clone()
        return not should_calc

    def store(self, hidden_states):
        self.previous_residual = hidden_states - self.previous_hidden_states
        self.previous_hidden_states = None

    def update(self, hidden_states):
        hidden_states = hidden_states + self.previous_residual
        return hidden_states


def lets_dance_hunyuan_video(
    dit: HunyuanVideoDiT,
    x: torch.Tensor,
    t: torch.Tensor,
    prompt_emb: torch.Tensor = None,
    text_mask: torch.Tensor = None,
    pooled_prompt_emb: torch.Tensor = None,
    freqs_cos: torch.Tensor = None,
    freqs_sin: torch.Tensor = None,
    guidance: torch.Tensor = None,
    tea_cache: TeaCache = None,
    **kwargs
):
    B, C, T, H, W = x.shape

    vec = dit.time_in(t, dtype=torch.float32) + dit.vector_in(pooled_prompt_emb) + dit.guidance_in(guidance * 1000, dtype=torch.float32)
    img = dit.img_in(x)
    txt = dit.txt_in(prompt_emb, t, text_mask)

    # TeaCache
    if tea_cache is not None:
        tea_cache_update = tea_cache.check(dit, img, vec)
    else:
        tea_cache_update = False

    if tea_cache_update:
        print("TeaCache skip forward.")
        img = tea_cache.update(img)
    else:
        split_token = int(text_mask.sum(dim=1))
        txt_len = int(txt.shape[1])
        for block in tqdm(dit.double_blocks, desc="Double stream blocks"):
            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin), split_token=split_token)
        x = torch.concat([img, txt], dim=1)
        for block in tqdm(dit.single_blocks, desc="Single stream blocks"):
            x = block(x, vec, (freqs_cos, freqs_sin), txt_len=txt_len, split_token=split_token)
        img = x[:, :-txt_len]
        if tea_cache is not None:
            tea_cache.store(img)

    img = dit.final_layer(img, vec)
    img = dit.unpatchify(img, T=T//1, H=H//2, W=W//2)
    return img


def lets_dance_hunyuan_video_i2v(
    dit: HunyuanVideoDiT,
    x: torch.Tensor,
    t: torch.Tensor,
    prompt_emb: torch.Tensor = None,
    text_mask: torch.Tensor = None,
    pooled_prompt_emb: torch.Tensor = None,
    freqs_cos: torch.Tensor = None,
    freqs_sin: torch.Tensor = None,
    guidance: torch.Tensor = None,
    tea_cache: TeaCache = None,
    **kwargs
):
    B, C, T, H, W = x.shape

    # Uncomment below to keep same as official implementation
    # guidance = guidance.to(dtype=torch.float32).to(torch.bfloat16)
    vec = dit.time_in(t, dtype=torch.bfloat16)
    vec_2 = dit.vector_in(pooled_prompt_emb)
    vec = vec + vec_2
    vec = vec + dit.guidance_in(guidance * 1000., dtype=torch.bfloat16)
    token_replace_vec = dit.time_in(torch.zeros_like(t), dtype=torch.bfloat16)
    tr_token = (H // 2) * (W // 2)
    token_replace_vec = token_replace_vec + vec_2

    img = dit.img_in(x)
    txt = dit.txt_in(prompt_emb, t, text_mask)
    # TeaCache
    if tea_cache is not None:
        tea_cache_update = tea_cache.check(dit, img, vec)
    else:
        tea_cache_update = False

    if tea_cache_update:
        print("TeaCache skip forward.")
        img = tea_cache.update(img)
    else:
        split_token = int(text_mask.sum(dim=1))
        txt_len = int(txt.shape[1])
        for block in tqdm(dit.double_blocks, desc="Double stream blocks"):
            img, txt = block(img, txt, vec, (freqs_cos, freqs_sin), token_replace_vec, tr_token, split_token)
        x = torch.concat([img, txt], dim=1)
        for block in tqdm(dit.single_blocks, desc="Single stream blocks"):
            x = block(x, vec, (freqs_cos, freqs_sin), txt_len, token_replace_vec, tr_token, split_token)
        img = x[:, :-txt_len]
        if tea_cache is not None:
            tea_cache.store(img)

    img = dit.final_layer(img, vec)
    img = dit.unpatchify(img, T=T//1, H=H//2, W=W//2)
    return img
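
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, kept as a comment so importing this module
# has no side effects). It assumes the ModelManager workflow used above; the
# checkpoint paths are placeholders that depend on how the HunyuanVideo
# weights were downloaded, and `save_video` is assumed to be the package's
# video export helper.
#
#   import torch
#   from diffsynth import ModelManager, HunyuanVideoPipeline, save_video
#
#   model_manager = ModelManager(torch_dtype=torch.float16, device="cpu")
#   model_manager.load_models([
#       "models/HunyuanVideo/text_encoder/model.safetensors",    # hypothetical path
#       "models/HunyuanVideo/text_encoder_2",                    # hypothetical path
#       "models/HunyuanVideo/vae/pytorch_model.pt",              # hypothetical path
#       "models/HunyuanVideo/transformers/model.pt",             # hypothetical path
#   ])
#   pipe = HunyuanVideoPipeline.from_model_manager(
#       model_manager, device="cuda", enable_vram_management=True,
#   )
#   video = pipe(
#       prompt="a cat is running on the grass",
#       height=720, width=1280, num_frames=129,
#       embedded_guidance=6.0, num_inference_steps=30, seed=0,
#   )
#   save_video(video, "video.mp4", fps=30)
# ---------------------------------------------------------------------------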