Spaces:

tencent
/

HunyuanImage-2.1

Running on A100

App Files Files Community

HunyuanImage-2.1 / hyimage /diffusion /pipelines /hunyuanimage_pipeline.py

KevinNg99

default shift

23c68ff about 15 hours ago

raw

history blame contribute delete

35 kB

	import re
	import os
	from dataclasses import dataclass
	from typing import Optional
	from einops import rearrange

	from tqdm import tqdm
	import loguru
	import torch
	from hyimage.common.config.lazy import DictConfig
	from PIL import Image

	from hyimage.common.config import instantiate
	from hyimage.common.constants import PRECISION_TO_TYPE
	from hyimage.common.format_prompt import MultilingualPromptFormat
	from hyimage.models.text_encoder import PROMPT_TEMPLATE
	from hyimage.models.model_zoo import HUNYUANIMAGE_REPROMPT
	from hyimage.models.text_encoder.byT5 import load_glyph_byT5_v2
	from hyimage.models.hunyuan.modules.hunyuanimage_dit import load_hunyuan_dit_state_dict
	from hyimage.diffusion.cfg_utils import AdaptiveProjectedGuidance, rescale_noise_cfg


	@dataclass
	class HunyuanImagePipelineConfig:
	    """
	    Configuration class for HunyuanImage diffusion pipeline.

	    This dataclass consolidates all configuration parameters for the pipeline,
	    including model configurations (DiT, VAE, text encoder) and pipeline
	    parameters (sampling steps, guidance scale, etc.).
	    """

	    # Model configurations
	    dit_config: DictConfig
	    vae_config: DictConfig
	    text_encoder_config: DictConfig
	    reprompt_config: DictConfig
	    refiner_model_name: str = "hunyuanimage-refiner"

	    # Offloading switches
	    enable_dit_offloading: bool = True
	    enable_reprompt_model_offloading: bool = True
	    enable_refiner_offloading: bool = True

	    # Guidance strategy selector and rescale factor
	    cfg_mode: str = "MIX_mode_0"
	    guidance_rescale: float = 0.0

	    # Pipeline parameters
	    default_sampling_steps: int = 50
	    # Default guidance scale, will be overridden by the guidance_scale parameter in __call__
	    default_guidance_scale: float = 3.5
	    # Inference shift
	    shift: int = 5
	    torch_dtype: str = "bf16"
	    device: str = "cuda"
	    version: str = ""

	    @classmethod
	    def create_default(cls, version: str = "v2.1", use_distilled: bool = False, **kwargs):
	        """
	        Create a default configuration for specified HunyuanImage version.

	        Args:
	            version: HunyuanImage version, only "v2.1" is supported
	            use_distilled: Whether to use distilled model
	            **kwargs: Additional configuration options. A key that matches one
	                of the preset fields (for example ``shift``) replaces the
	                preset instead of raising a duplicate-keyword ``TypeError``.

	        Returns:
	            A populated HunyuanImagePipelineConfig.

	        Raises:
	            ValueError: If ``version`` is not "v2.1".
	        """
	        if version != "v2.1":
	            raise ValueError(f"Unsupported HunyuanImage version: {version}. Only 'v2.1' is supported")

	        from hyimage.models.model_zoo import (
	            HUNYUANIMAGE_V2_1_DIT,
	            HUNYUANIMAGE_V2_1_DIT_CFG_DISTILL,
	            HUNYUANIMAGE_V2_1_VAE_32x,
	            HUNYUANIMAGE_V2_1_TEXT_ENCODER,
	        )

	        dit_config = HUNYUANIMAGE_V2_1_DIT_CFG_DISTILL() if use_distilled else HUNYUANIMAGE_V2_1_DIT()
	        settings = dict(
	            dit_config=dit_config,
	            vae_config=HUNYUANIMAGE_V2_1_VAE_32x(),
	            text_encoder_config=HUNYUANIMAGE_V2_1_TEXT_ENCODER(),
	            reprompt_config=HUNYUANIMAGE_REPROMPT(),
	            shift=4 if use_distilled else 5,
	            default_guidance_scale=3.25 if use_distilled else 3.5,
	            default_sampling_steps=8 if use_distilled else 50,
	            version=version,
	        )
	        # Caller-supplied options win over the presets above.
	        settings.update(kwargs)
	        return cls(**settings)


	class HunyuanImagePipeline:
	"""
	User-friendly pipeline for HunyuanImage text-to-image generation.

	This pipeline provides a simple interface similar to diffusers library
	for generating high-quality images from text prompts.

	Supports HunyuanImage 2.1 version with automatic configuration.
	Both default and distilled (CFG distillation) models are supported.
	"""

	def __init__(
	    self,
	    config: HunyuanImagePipelineConfig,
	    **kwargs
	):
	    """
	    Set up pipeline state from ``config`` and load the model components.

	    Args:
	        config: Configuration object containing all model and pipeline settings
	        **kwargs: Additional configuration options (accepted but not read here)
	    """

	    def _new_apg_guider():
	        # Every guider used by this pipeline is built with the same settings.
	        return AdaptiveProjectedGuidance(
	            guidance_scale=10.0,
	            eta=0.0,
	            adaptive_projected_guidance_rescale=10.0,
	            adaptive_projected_guidance_momentum=-0.5,
	        )

	    self.config = config

	    # Sampling defaults copied from the config
	    self.default_sampling_steps = config.default_sampling_steps
	    self.default_guidance_scale = config.default_guidance_scale
	    self.shift = config.shift
	    self.torch_dtype = PRECISION_TO_TYPE[config.torch_dtype]

	    # ``device`` follows calls to ``to()``; ``execution_device`` keeps the configured target
	    self.device = config.device
	    self.execution_device = config.device

	    # Model components, filled in by the loaders below
	    self.dit = None
	    self.text_encoder = None
	    self.vae = None
	    self.byt5_kwargs = None
	    self.prompt_format = None

	    self.enable_dit_offloading = config.enable_dit_offloading
	    self.enable_reprompt_model_offloading = config.enable_reprompt_model_offloading
	    self.enable_refiner_offloading = config.enable_refiner_offloading

	    self.cfg_mode = config.cfg_mode
	    self.guidance_rescale = config.guidance_rescale

	    if self.cfg_mode == "APG_mode_0":
	        self.cfg_guider = _new_apg_guider()
	        self.apg_start_step = 10
	    elif self.cfg_mode == "MIX_mode_0":
	        # Separate guiders (and switch-over steps) for OCR and general prompts
	        self.cfg_guider_ocr = _new_apg_guider()
	        self.apg_start_step_ocr = 75

	        self.cfg_guider_general = _new_apg_guider()
	        self.apg_start_step_general = 10

	    self.ocr_mask = []

	    self._load_models()

	def _load_dit(self):
	    """
	    Instantiate the DiT, load its checkpoint and move it to the pipeline device.

	    Raises:
	        RuntimeError: If any step fails, including a missing checkpoint path.
	    """
	    try:
	        cfg = self.config.dit_config
	        self.dit = instantiate(cfg.model)
	        if not cfg.load_from:
	            raise ValueError("Must provide checkpoint path for DiT model")
	        load_hunyuan_dit_state_dict(self.dit, cfg.load_from, strict=True)

	        self.dit = self.dit.to(self.device, dtype=self.torch_dtype)
	        self.dit.eval()

	        wants_compile = getattr(cfg, "use_compile", False)
	        if wants_compile:
	            self.dit = torch.compile(self.dit)
	        loguru.logger.info("✓ DiT model loaded")
	    except Exception as e:
	        raise RuntimeError(f"Error loading DiT model: {e}") from e

	def _load_text_encoder(self):
	    """
	    Instantiate the text encoder described by ``config.text_encoder_config``.

	    The encoder's ``max_length`` is ``text_len`` plus the ``crop_start`` of
	    the selected prompt template (0 when no template is configured).

	    Raises:
	        RuntimeError: If any step fails, including a missing checkpoint path.
	    """
	    try:
	        cfg = self.config.text_encoder_config
	        if not cfg.load_from:
	            raise ValueError("Must provide checkpoint path for text encoder")

	        template = None
	        crop_start = 0
	        if cfg.prompt_template is not None:
	            template = PROMPT_TEMPLATE[cfg.prompt_template]
	            crop_start = template.get("crop_start", 0)

	        encoder_path = os.path.join(cfg.load_from, "llm")
	        self.text_encoder = instantiate(
	            cfg.model,
	            max_length=cfg.text_len + crop_start,
	            text_encoder_path=encoder_path,
	            prompt_template=template,
	            logger=None,
	            device=self.device,
	        )
	        loguru.logger.info("✓ HunyuanImage text encoder loaded")
	    except Exception as e:
	        raise RuntimeError(f"Error loading text encoder: {e}") from e

	def _load_vae(self):
	    """
	    Instantiate the VAE from ``config.vae_config`` and move it to the pipeline device.

	    Raises:
	        RuntimeError: If instantiation or the device move fails.
	    """
	    try:
	        cfg = self.config.vae_config
	        vae = instantiate(cfg.model, vae_path=cfg.load_from)
	        self.vae = vae
	        self.vae = self.vae.to(self.device)
	        loguru.logger.info("✓ VAE loaded")
	    except Exception as e:
	        raise RuntimeError(f"Error loading VAE: {e}") from e

	def _load_reprompt_model(self):
	    """
	    Instantiate the reprompt model and store it on ``self._reprompt_model``.

	    Raises:
	        RuntimeError: If instantiation fails.
	    """
	    try:
	        cfg = self.config.reprompt_config
	        self._reprompt_model = instantiate(
	            cfg.model,
	            models_root_path=cfg.load_from,
	            enable_offloading=self.enable_reprompt_model_offloading,
	        )
	        loguru.logger.info("✓ Reprompt model loaded")
	    except Exception as e:
	        raise RuntimeError(f"Error loading reprompt model: {e}") from e

	@property
	def refiner_pipeline(self):
	"""
	As the refiner model is an optional component, we load it on demand.
	"""
	if hasattr(self, '_refiner_pipeline') and self._refiner_pipeline is not None:
	return self._refiner_pipeline
	from hyimage.diffusion.pipelines.hunyuanimage_refiner_pipeline import HunYuanImageRefinerPipeline
	self._refiner_pipeline = HunYuanImageRefinerPipeline.from_pretrained(self.config.refiner_model_name)
	return self._refiner_pipeline

	@property
	def reprompt_model(self):
	"""
	As the reprompt model is an optional component, we load it on demand.
	"""
	if hasattr(self, '_reprompt_model') and self._reprompt_model is not None:
	return self._reprompt_model
	self._load_reprompt_model()
	return self._reprompt_model

	def _load_byt5(self):

	assert self.dit is not None, "DiT model must be loaded before byT5"

	if not self.use_byt5:
	self.byt5_kwargs = None
	self.prompt_format = None
	return

	try:

	text_encoder_config = self.config.text_encoder_config

	glyph_root = os.path.join(self.config.text_encoder_config.load_from, "Glyph-SDXL-v2")
	if not os.path.exists(glyph_root):
	raise RuntimeError(
	f"Glyph checkpoint not found from '{glyph_root}'. \n"
	"Please download from https://modelscope.cn/models/AI-ModelScope/Glyph-SDXL-v2/files.\n\n"
	"- Required files:\n"
	" Glyph-SDXL-v2\n"
	" ├── assets\n"
	" │ ├── color_idx.json\n"
	" │ └── multilingual_10-lang_idx.json\n"
	" └── checkpoints\n"
	" └── byt5_model.pt\n"
	)


	byT5_google_path = os.path.join(text_encoder_config.load_from, "byt5-small")
	if not os.path.exists(byT5_google_path):
	loguru.logger.warning(f"ByT5 google path not found from: {byT5_google_path}. Try downloading from https://huggingface.co/google/byt5-small.")
	byT5_google_path = "google/byt5-small"


	multilingual_prompt_format_color_path = os.path.join(glyph_root, "assets/color_idx.json")
	multilingual_prompt_format_font_path = os.path.join(glyph_root, "assets/multilingual_10-lang_idx.json")

	byt5_args = dict(
	byT5_google_path=byT5_google_path,
	byT5_ckpt_path=os.path.join(glyph_root, "checkpoints/byt5_model.pt"),
	multilingual_prompt_format_color_path=multilingual_prompt_format_color_path,
	multilingual_prompt_format_font_path=multilingual_prompt_format_font_path,
	byt5_max_length=128
	)

	self.byt5_kwargs = load_glyph_byT5_v2(byt5_args, device=self.device)
	self.prompt_format = MultilingualPromptFormat(
	font_path=multilingual_prompt_format_font_path,
	color_path=multilingual_prompt_format_color_path
	)
	loguru.logger.info("✓ byT5 glyph processor loaded")
	except Exception as e:
	raise RuntimeError("Error loading byT5 glyph processor") from e

	def _load_models(self):
	    """
	    Load every eagerly-loaded component: VAE, DiT, byT5, then the text encoder.

	    The DiT is loaded before byT5 because the byT5 loader requires it.
	    """
	    loguru.logger.info("Loading HunyuanImage models...")
	    loaders = (
	        self._load_vae,
	        self._load_dit,
	        self._load_byt5,
	        self._load_text_encoder,
	    )
	    for load_component in loaders:
	        load_component()


	def _encode_text(self, prompt: str, data_type: str = "image"):
	"""
	Encode text prompt to embeddings.

	Args:
	prompt: The text prompt
	data_type: The type of data ("image" by default)

	Returns:
	Tuple of (text_emb, text_mask)
	"""
	text_inputs = self.text_encoder.text2tokens(prompt)
	with torch.no_grad():
	text_outputs = self.text_encoder.encode(
	text_inputs,
	data_type=data_type,
	)
	text_emb = text_outputs.hidden_state
	text_mask = text_outputs.attention_mask
	return text_emb, text_mask

	def _encode_glyph(self, prompt: str):
	"""
	Encode glyph information using byT5.

	Args:
	prompt: The text prompt

	Returns:
	Tuple of (byt5_emb, byt5_mask)
	"""
	if not self.use_byt5:
	return None, None

	if not prompt:
	return (
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"], 1472), device=self.device),
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"]), device=self.device, dtype=torch.int64)
	)

	try:
	text_prompt_texts = []
	pattern_quote_double = r'\"(.*?)\"'
	pattern_quote_chinese_single = r'‘(.*?)’'
	pattern_quote_chinese_double = r'“(.*?)”'

	matches_quote_double = re.findall(pattern_quote_double, prompt)
	matches_quote_chinese_single = re.findall(pattern_quote_chinese_single, prompt)
	matches_quote_chinese_double = re.findall(pattern_quote_chinese_double, prompt)

	text_prompt_texts.extend(matches_quote_double)
	text_prompt_texts.extend(matches_quote_chinese_single)
	text_prompt_texts.extend(matches_quote_chinese_double)

	if not text_prompt_texts:
	self.ocr_mask = [False]
	return (
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"], 1472), device=self.device),
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"]), device=self.device, dtype=torch.int64)
	)
	self.ocr_mask = [True]

	text_prompt_style_list = [{'color': None, 'font-family': None} for _ in range(len(text_prompt_texts))]
	glyph_text_formatted = self.prompt_format.format_prompt(text_prompt_texts, text_prompt_style_list)

	byt5_text_ids, byt5_text_mask = self._get_byt5_text_tokens(
	self.byt5_kwargs["byt5_tokenizer"],
	self.byt5_kwargs["byt5_max_length"],
	glyph_text_formatted
	)

	byt5_text_ids = byt5_text_ids.to(device=self.device)
	byt5_text_mask = byt5_text_mask.to(device=self.device)

	byt5_prompt_embeds = self.byt5_kwargs["byt5_model"](
	byt5_text_ids, attention_mask=byt5_text_mask.float()
	)
	byt5_emb = byt5_prompt_embeds[0]

	return byt5_emb, byt5_text_mask
	except Exception as e:
	loguru.logger.warning(f"Warning: Error in glyph encoding, using fallback: {e}")
	return (
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"], 1472), device=self.device),
	torch.zeros((1, self.byt5_kwargs["byt5_max_length"]), device=self.device, dtype=torch.int64)
	)

	def _get_byt5_text_tokens(self, tokenizer, max_length, text_list):
	"""
	Get byT5 text tokens.

	Args:
	tokenizer: The tokenizer object
	max_length: Maximum token length
	text_list: List or string of text

	Returns:
	Tuple of (byt5_text_ids, byt5_text_mask)
	"""
	if isinstance(text_list, list):
	text_prompt = " ".join(text_list)
	else:
	text_prompt = text_list

	byt5_text_inputs = tokenizer(
	text_prompt,
	padding="max_length",
	max_length=max_length,
	truncation=True,
	add_special_tokens=True,
	return_tensors="pt",
	)

	byt5_text_ids = byt5_text_inputs.input_ids
	byt5_text_mask = byt5_text_inputs.attention_mask

	return byt5_text_ids, byt5_text_mask

	def _prepare_latents(self, width: int, height: int, generator: torch.Generator, batch_size: int = 1, vae_downsampling_factor: int = 32):
	"""
	Prepare initial noise latents.

	Args:
	width: Image width
	height: Image height
	generator: Torch random generator
	batch_size: Batch size

	Returns:
	Latent tensor
	"""
	assert width % vae_downsampling_factor == 0 and height % vae_downsampling_factor == 0, (
	f"width and height must be divisible by {vae_downsampling_factor}, but got {width} and {height}"
	)
	latent_width = width // vae_downsampling_factor
	latent_height = height // vae_downsampling_factor
	latent_channels = 64

	if len(self.dit.patch_size) == 3:
	latent_shape = (batch_size, latent_channels, 1, latent_height, latent_width)
	elif len(self.dit.patch_size) == 2:
	latent_shape = (batch_size, latent_channels, latent_height, latent_width)
	else:
	raise ValueError(f"Unsupported patch_size: {self.dit.patch_size}")


	# Generate random noise with shape latent_shape
	latents = torch.randn(
	latent_shape,
	device=generator.device,
	dtype=self.torch_dtype,
	generator=generator,
	).to(device=self.device)

	return latents

	def _denoise_step(self, latents, timesteps, text_emb, text_mask, byt5_emb, byt5_mask, guidance_scale: float = 1.0, timesteps_r=None):
	"""
	Perform one denoising step.

	Args:
	latents: Latent tensor
	timesteps: Timesteps tensor
	text_emb: Text embedding
	text_mask: Text mask
	byt5_emb: byT5 embedding
	byt5_mask: byT5 mask
	guidance_scale: Guidance scale
	timesteps_r: Optional next timestep

	Returns:
	Noise prediction tensor
	"""
	if byt5_emb is not None and byt5_mask is not None:
	extra_kwargs = {
	"byt5_text_states": byt5_emb,
	"byt5_text_mask": byt5_mask,
	}
	else:
	if self.use_byt5:
	raise ValueError("Must provide byt5_emb and byt5_mask for HunyuanImage 2.1")
	extra_kwargs = {}

	with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
	if hasattr(self.dit, 'guidance_embed') and self.dit.guidance_embed:
	guidance_expand = torch.tensor(
	[guidance_scale] * latents.shape[0],
	dtype=torch.float32,
	device=latents.device
	).to(latents.dtype) * 1000
	else:
	guidance_expand = None

	noise_pred = self.dit(
	latents,
	timesteps,
	text_states=text_emb,
	encoder_attention_mask=text_mask,
	guidance=guidance_expand,
	return_dict=False,
	extra_kwargs=extra_kwargs,
	timesteps_r=timesteps_r,
	)[0]

	return noise_pred

	def _apply_classifier_free_guidance(self, noise_pred, guidance_scale: float, i: int):
	"""
	Apply classifier-free guidance.

	Args:
	noise_pred: Noise prediction tensor
	guidance_scale: Guidance scale

	Returns:
	Guided noise prediction tensor
	"""
	if guidance_scale == 1.0:
	return noise_pred

	noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)


	if self.cfg_mode.startswith("APG_mode_"):
	if i <= self.apg_start_step:
	noise_pred = noise_pred_uncond + guidance_scale * (
	noise_pred_text - noise_pred_uncond
	)
	_ = self.cfg_guider(noise_pred_text, noise_pred_uncond, step=i)
	else:
	noise_pred = self.cfg_guider(noise_pred_text, noise_pred_uncond, step=i)
	elif self.cfg_mode.startswith("MIX_mode_"):

	ocr_mask_bool = torch.tensor(self.ocr_mask, dtype=torch.bool)

	true_idx = torch.where(ocr_mask_bool)[0]
	false_idx = torch.where(~ocr_mask_bool)[0]

	noise_pred_text_true = noise_pred_text[true_idx] if len(true_idx) > 0 else \
	torch.empty((0, noise_pred_text.size(1)), dtype=noise_pred_text.dtype, device=noise_pred_text.device)
	noise_pred_text_false = noise_pred_text[false_idx] if len(false_idx) > 0 else \
	torch.empty((0, noise_pred_text.size(1)), dtype=noise_pred_text.dtype, device=noise_pred_text.device)

	noise_pred_uncond_true = noise_pred_uncond[true_idx] if len(true_idx) > 0 else \
	torch.empty((0, noise_pred_uncond.size(1)), dtype=noise_pred_uncond.dtype, device=noise_pred_uncond.device)
	noise_pred_uncond_false = noise_pred_uncond[false_idx] if len(false_idx) > 0 else \
	torch.empty((0, noise_pred_uncond.size(1)), dtype=noise_pred_uncond.dtype, device=noise_pred_uncond.device)

	if len(noise_pred_text_true) > 0:
	if i <= self.apg_start_step_ocr:
	noise_pred_true = noise_pred_uncond_true + guidance_scale * (
	noise_pred_text_true - noise_pred_uncond_true
	)
	_ = self.cfg_guider_ocr(noise_pred_text_true, noise_pred_uncond_true, step=i)
	else:
	noise_pred_true = self.cfg_guider_ocr(noise_pred_text_true, noise_pred_uncond_true, step=i)
	else:
	noise_pred_true = noise_pred_text_true

	if len(noise_pred_text_false) > 0:
	if i <= self.apg_start_step_general:
	noise_pred_false = noise_pred_uncond_false + guidance_scale * (
	noise_pred_text_false - noise_pred_uncond_false
	)
	_ = self.cfg_guider_general(noise_pred_text_false, noise_pred_uncond_false, step=i)
	else:
	noise_pred_false = self.cfg_guider_general(noise_pred_text_false, noise_pred_uncond_false, step=i)
	else:
	noise_pred_false = noise_pred_text_false

	noise_pred = torch.empty_like(noise_pred_text)
	if len(true_idx) > 0:
	noise_pred[true_idx] = noise_pred_true
	if len(false_idx) > 0:
	noise_pred[false_idx] = noise_pred_false

	else:
	noise_pred = noise_pred_uncond + guidance_scale * (
	noise_pred_text - noise_pred_uncond
	)

	if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
	# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
	noise_pred = rescale_noise_cfg(
	noise_pred,
	noise_pred_text,
	guidance_rescale=self.guidance_rescale,
	)

	return noise_pred

	def _decode_latents(self, latents, reorg_tokens=False):
	"""
	Decode latents to images using VAE.

	Args:
	latents: Latent tensor

	Returns:
	Image tensor
	"""
	if hasattr(self.vae.config, "shift_factor") and self.vae.config.shift_factor:
	latents = latents / self.vae.config.scaling_factor + self.vae.config.shift_factor
	else:
	latents = latents / self.vae.config.scaling_factor

	if reorg_tokens:
	latents = rearrange(latents, "b c f h w -> b f c h w")
	latents = rearrange(latents, "b f (n c) h w -> b (f n) c h w", n=2)
	latents = rearrange(latents, "b f c h w -> b c f h w")
	latents = latents[:, :, 1:]

	if latents.ndim == 5:
	latents = latents.squeeze(2)
	if latents.ndim == 4:
	latents = latents.unsqueeze(2)

	with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
	image = self.vae.decode(latents, return_dict=False)[0]

	# Post-process image - remove frame dimension and normalize
	image = (image / 2 + 0.5).clamp(0, 1)
	image = image[:, :, 0] # Remove frame dimension for images
	image = image.cpu().float()

	return image

	def get_timesteps_sigmas(self, sampling_steps: int, shift):
	sigmas = torch.linspace(1, 0, sampling_steps + 1)
	sigmas = (shift * sigmas) / (1 + (shift - 1) * sigmas)
	sigmas = sigmas.to(torch.float32)
	timesteps = (sigmas[:-1] * 1000).to(dtype=torch.float32, device=self.device)
	return timesteps, sigmas

	def step(self, latents, noise_pred, sigmas, step_i):
	    """Advance the latents from ``sigmas[step_i]`` to ``sigmas[step_i + 1]``, in float32."""
	    sigma_drop = sigmas[step_i] - sigmas[step_i + 1]
	    return latents.float() - sigma_drop * noise_pred.float()

	@torch.no_grad()
	def __call__(
	    self,
	    prompt: str,
	    shift: Optional[int] = 4,
	    negative_prompt: str = "",
	    width: int = 2048,
	    height: int = 2048,
	    use_reprompt: bool = False,
	    use_refiner: bool = False,
	    num_inference_steps: Optional[int] = None,
	    guidance_scale: Optional[float] = None,
	    seed: Optional[int] = 42,
	    **kwargs
	) -> Image.Image:
	    """
	    Generate an image from a text prompt.

	    Args:
	        prompt: Text prompt describing the image
	        shift: Inference shift for the sigma schedule. The configured
	            ``shift`` is only used when None is passed explicitly, because
	            the default here is 4.
	        negative_prompt: Negative prompt for guidance
	        width: Image width
	        height: Image height
	        use_reprompt: Whether to use reprompt model
	        use_refiner: Whether to use refiner pipeline
	        num_inference_steps: Number of denoising steps (overrides config if provided)
	        guidance_scale: Strength of classifier-free guidance (overrides config if provided)
	        seed: Random seed for reproducibility; None draws a fresh random seed
	        **kwargs: Additional arguments (accepted but not read here)

	    Returns:
	        Generated PIL Image
	    """
	    # Always hand a real generator downstream; with no seed it is seeded
	    # non-deterministically instead of being left as None.
	    generator = torch.Generator(device='cpu')
	    if seed is not None:
	        generator.manual_seed(seed)
	        torch.manual_seed(seed)
	    else:
	        generator.seed()

	    sampling_steps = num_inference_steps if num_inference_steps is not None else self.default_sampling_steps
	    guidance_scale = guidance_scale if guidance_scale is not None else self.default_guidance_scale
	    shift = shift if shift is not None else self.shift

	    user_prompt = prompt
	    if use_reprompt:
	        if self.enable_dit_offloading:
	            self.to('cpu')
	        try:
	            prompt = self.reprompt_model.predict(prompt)
	        finally:
	            # Bring the models back even if the reprompt model fails.
	            if self.enable_dit_offloading:
	                self.to(self.execution_device)

	    print("=" * 60)
	    print("🖼️ HunyuanImage Generation Task")
	    print("-" * 60)
	    print(f"Prompt: {user_prompt}")
	    if use_reprompt:
	        print(f"Reprompt: {prompt}")
	    if not self.cfg_distilled:
	        print(f"Negative Prompt: {negative_prompt if negative_prompt else '(none)'}")
	    print(f"Guidance Scale: {guidance_scale}")
	    print(f"CFG Mode: {self.cfg_mode}")
	    print(f"Guidance Rescale: {self.guidance_rescale}")
	    print(f"Shift: {shift}")
	    print(f"Seed: {seed}")
	    print(f"Use MeanFlow: {self.use_meanflow}")
	    print(f"Use byT5: {self.use_byt5}")
	    print(f"Image Size: {width} x {height}")
	    print(f"Sampling Steps: {sampling_steps}")
	    print("=" * 60)

	    pos_text_emb, pos_text_mask = self._encode_text(prompt)
	    neg_text_emb, neg_text_mask = self._encode_text(negative_prompt)

	    # The OCR mask must describe the positive prompt of THIS call: reset it
	    # first (glyph encoding may leave it untouched), and restore it after the
	    # negative prompt is encoded, since that call also writes to it.
	    self.ocr_mask = [False]
	    pos_byt5_emb, pos_byt5_mask = self._encode_glyph(prompt)
	    prompt_ocr_mask = self.ocr_mask
	    neg_byt5_emb, neg_byt5_mask = self._encode_glyph(negative_prompt)
	    self.ocr_mask = prompt_ocr_mask

	    latents = self._prepare_latents(width, height, generator=generator)

	    do_classifier_free_guidance = (not self.cfg_distilled) and guidance_scale > 1
	    # The guidance step reads this flag from the instance.
	    self.do_classifier_free_guidance = do_classifier_free_guidance
	    if do_classifier_free_guidance:
	        # Unconditional (negative) half first, text half second.
	        text_emb = torch.cat([neg_text_emb, pos_text_emb])
	        text_mask = torch.cat([neg_text_mask, pos_text_mask])

	        if self.use_byt5 and pos_byt5_emb is not None and neg_byt5_emb is not None:
	            byt5_emb = torch.cat([neg_byt5_emb, pos_byt5_emb])
	            byt5_mask = torch.cat([neg_byt5_mask, pos_byt5_mask])
	        else:
	            byt5_emb = pos_byt5_emb
	            byt5_mask = pos_byt5_mask
	    else:
	        text_emb = pos_text_emb
	        text_mask = pos_text_mask
	        byt5_emb = pos_byt5_emb
	        byt5_mask = pos_byt5_mask

	    timesteps, sigmas = self.get_timesteps_sigmas(sampling_steps, shift)

	    for i, t in enumerate(tqdm(timesteps, desc="Denoising", total=len(timesteps))):
	        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
	        t_expand = t.repeat(latent_model_input.shape[0])
	        if self.use_meanflow:
	            # MeanFlow also receives the next timestep (0 after the last one).
	            if i == len(timesteps) - 1:
	                timesteps_r = torch.tensor([0.0], device=self.device)
	            else:
	                timesteps_r = timesteps[i + 1]
	            timesteps_r = timesteps_r.repeat(latent_model_input.shape[0])
	        else:
	            timesteps_r = None

	        if self.cfg_distilled:
	            noise_pred = self._denoise_step(
	                latent_model_input, t_expand, text_emb, text_mask, byt5_emb, byt5_mask, guidance_scale, timesteps_r=timesteps_r,
	            )
	        else:
	            noise_pred = self._denoise_step(
	                latent_model_input, t_expand, text_emb, text_mask, byt5_emb, byt5_mask, timesteps_r=timesteps_r,
	            )

	        if do_classifier_free_guidance:
	            noise_pred = self._apply_classifier_free_guidance(noise_pred, guidance_scale, i)

	        latents = self.step(latents, noise_pred, sigmas, i)

	    image = self._decode_latents(latents)
	    image = (image.squeeze(0).permute(1, 2, 0) * 255).byte().numpy()
	    pil_image = Image.fromarray(image)

	    if use_refiner:
	        if self.enable_dit_offloading:
	            self.to('cpu')
	        try:
	            if self.enable_refiner_offloading:
	                self.refiner_pipeline.to(self.execution_device)
	            try:
	                pil_image = self.refiner_pipeline(
	                    image=pil_image,
	                    prompt=prompt,
	                    negative_prompt=negative_prompt,
	                    width=width,
	                    height=height,
	                    use_reprompt=False,
	                    use_refiner=False,
	                    num_inference_steps=4,
	                    guidance_scale=guidance_scale,
	                    generator=generator,
	                )
	            finally:
	                if self.enable_refiner_offloading:
	                    self.refiner_pipeline.to('cpu')
	        finally:
	            # Restore the base models even if the refiner fails.
	            if self.enable_dit_offloading:
	                self.to(self.execution_device)

	    return pil_image

	@property
	def use_meanflow(self):
	return getattr(self.dit, 'use_meanflow', False)

	@property
	def use_byt5(self):
	return getattr(self.dit, 'glyph_byT5_v2', False)

	@property
	def cfg_distilled(self):
	return getattr(self.dit, 'guidance_embed', False)

	def to(self, device: str \| torch.device):
	"""
	Move pipeline to specified device.

	Args:
	device: Target device string

	Returns:
	Self
	"""
	self.device = device
	if self.dit is not None:
	self.dit = self.dit.to(device, non_blocking=True)
	# if self.text_encoder is not None:
	# self.text_encoder = self.text_encoder.to(device, non_blocking=True)
	if self.vae is not None:
	self.vae = self.vae.to(device, non_blocking=True)
	return self

	def update_config(self, **kwargs):
	"""
	Update configuration parameters.

	Args:
	**kwargs: Key-value pairs to update

	Returns:
	Self
	"""
	for key, value in kwargs.items():
	if hasattr(self.config, key):
	setattr(self.config, key, value)
	if hasattr(self, key):
	setattr(self, key, value)
	return self

	@classmethod
	def from_pretrained(cls, model_name: str = "hunyuanimage-v2.1", use_distilled: bool = False, **kwargs):
	"""
	Create pipeline from pretrained model.

	Args:
	model_name: Model name, supports "hunyuanimage-v2.1", "hunyuanimage-v2.1-distilled"
	use_distilled: Whether to use distilled model (overrides model_name if specified)
	**kwargs: Additional configuration options

	Returns:
	HunyuanImagePipeline instance
	"""
	if model_name == "hunyuanimage-v2.1":
	version = "v2.1"
	use_distilled = False
	elif model_name == "hunyuanimage-v2.1-distilled":
	version = "v2.1"
	use_distilled = True
	else:
	raise ValueError(
	f"Unsupported model name: {model_name}. Supported names: 'hunyuanimage-v2.1', 'hunyuanimage-v2.1-distilled'"
	)

	config = HunyuanImagePipelineConfig.create_default(
	version=version, use_distilled=use_distilled, **kwargs
	)
	return cls(config=config)

	@classmethod
	def from_config(cls, config: HunyuanImagePipelineConfig):
	    """
	    Build a pipeline directly from a ready-made configuration.

	    Args:
	        config: HunyuanImagePipelineConfig instance

	    Returns:
	        HunyuanImagePipeline instance
	    """
	    pipeline = cls(config=config)
	    return pipeline


	def DiffusionPipeline(model_name: str = "hunyuanimage-v2.1", use_distilled: bool = False, **kwargs):
	    """
	    Convenience factory that forwards to ``HunyuanImagePipeline.from_pretrained``.

	    Args:
	        model_name: Model name, supports "hunyuanimage-v2.1", "hunyuanimage-v2.1-distilled"
	        use_distilled: Whether to use distilled model; forwarded unchanged
	        **kwargs: Additional configuration options

	    Returns:
	        HunyuanImagePipeline instance
	    """
	    pipeline = HunyuanImagePipeline.from_pretrained(
	        model_name,
	        use_distilled=use_distilled,
	        **kwargs,
	    )
	    return pipeline