Commit 737c1a0 · Parent: 8776445
Update code

Changed files:
- adaface/adaface_wrapper.py +34 -21
- adaface/diffusers_attn_lora_capture.py +67 -62
- adaface/face_id_to_ada_prompt.py +14 -16
- adaface/unet_teachers.py +37 -36
- adaface/util.py +6 -6
- app.py +62 -41
adaface/adaface_wrapper.py
CHANGED
@@ -30,7 +30,7 @@ class AdaFaceWrapper(nn.Module):
|
|
30 |
use_840k_vae=False, use_ds_text_encoder=False,
|
31 |
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
|
32 |
enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
|
33 |
-
attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
34 |
device='cuda', is_training=False):
|
35 |
'''
|
36 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
@@ -52,7 +52,7 @@ class AdaFaceWrapper(nn.Module):
|
|
52 |
self.q_lora_updates_query = q_lora_updates_query
|
53 |
self.use_lcm = use_lcm
|
54 |
self.subject_string = subject_string
|
55 |
-
self.
|
56 |
|
57 |
self.default_scheduler_name = default_scheduler_name
|
58 |
self.num_inference_steps = num_inference_steps if not use_lcm else 4
|
@@ -189,10 +189,10 @@ class AdaFaceWrapper(nn.Module):
|
|
189 |
pipeline.unet = unet_ensemble
|
190 |
|
191 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
192 |
-
if not remove_unet and (self.unet_uses_attn_lora or self.
|
193 |
unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
|
194 |
attn_lora_layer_names=self.attn_lora_layer_names,
|
195 |
-
|
196 |
q_lora_updates_query=self.q_lora_updates_query)
|
197 |
|
198 |
pipeline.unet = unet2
|
@@ -294,12 +294,11 @@ class AdaFaceWrapper(nn.Module):
|
|
294 |
def load_unet_loras(self, unet, unet_lora_modules_state_dict,
|
295 |
use_attn_lora=True, use_ffn_lora=False,
|
296 |
attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
297 |
-
|
298 |
q_lora_updates_query=False):
|
299 |
attn_capture_procs, attn_opt_modules = \
|
300 |
set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
|
301 |
lora_rank=192, lora_scale_down=8,
|
302 |
-
cross_attn_shrink_factor=cross_attn_shrink_factor,
|
303 |
q_lora_updates_query=q_lora_updates_query)
|
304 |
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
|
305 |
if use_ffn_lora:
|
@@ -343,16 +342,17 @@ class AdaFaceWrapper(nn.Module):
|
|
343 |
print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
|
344 |
self.outfeat_capture_blocks.append(unet.up_blocks[3])
|
345 |
|
346 |
-
# If
|
347 |
# but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
|
348 |
set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
|
349 |
use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
|
350 |
-
|
|
|
351 |
|
352 |
return unet
|
353 |
|
354 |
def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
355 |
-
|
356 |
unet_lora_weight_found = False
|
357 |
if isinstance(self.adaface_ckpt_paths, str):
|
358 |
adaface_ckpt_paths = [self.adaface_ckpt_paths]
|
@@ -360,7 +360,7 @@ class AdaFaceWrapper(nn.Module):
|
|
360 |
adaface_ckpt_paths = self.adaface_ckpt_paths
|
361 |
|
362 |
for adaface_ckpt_path in adaface_ckpt_paths:
|
363 |
-
ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu')
|
364 |
if 'unet_lora_modules' in ckpt_dict:
|
365 |
unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
|
366 |
print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
|
@@ -379,7 +379,7 @@ class AdaFaceWrapper(nn.Module):
|
|
379 |
unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
|
380 |
use_attn_lora=use_attn_lora,
|
381 |
attn_lora_layer_names=attn_lora_layer_names,
|
382 |
-
|
383 |
q_lora_updates_query=q_lora_updates_query)
|
384 |
unet.unets[i] = unet_
|
385 |
print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
|
@@ -387,7 +387,7 @@ class AdaFaceWrapper(nn.Module):
|
|
387 |
unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
|
388 |
use_attn_lora=use_attn_lora,
|
389 |
attn_lora_layer_names=attn_lora_layer_names,
|
390 |
-
|
391 |
q_lora_updates_query=q_lora_updates_query)
|
392 |
|
393 |
return unet
|
@@ -612,8 +612,9 @@ class AdaFaceWrapper(nn.Module):
|
|
612 |
# Scan prompt and replace tokens in self.placeholder_token_ids
|
613 |
# with the corresponding image embeddings.
|
614 |
prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
|
|
|
615 |
prompt_embeds2 = prompt_embeds.clone()
|
616 |
-
if alt_prompt_embed_type
|
617 |
if self.img_prompt_embs is None:
|
618 |
print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
|
619 |
return prompt_embeds
|
@@ -628,17 +629,18 @@ class AdaFaceWrapper(nn.Module):
|
|
628 |
breakpoint()
|
629 |
|
630 |
repl_tokens = {}
|
|
|
631 |
for i in range(len(prompt_tokens)):
|
632 |
if prompt_tokens[i] in self.all_placeholder_tokens:
|
633 |
encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
|
634 |
if prompt_tokens[i] in sublist), 0)
|
635 |
-
alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx]
|
636 |
-
prompt_embeds2[:, i] = prompt_embeds2[:, i] *
|
637 |
+ repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
|
638 |
repl_tokens[prompt_tokens[i]] = 1
|
639 |
|
640 |
repl_token_count = len(repl_tokens)
|
641 |
-
if
|
642 |
print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
|
643 |
else:
|
644 |
print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
|
@@ -650,7 +652,7 @@ class AdaFaceWrapper(nn.Module):
|
|
650 |
placeholder_tokens_pos='append',
|
651 |
ablate_prompt_only_placeholders=False,
|
652 |
ablate_prompt_no_placeholders=False,
|
653 |
-
ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
|
654 |
nonmix_prompt_emb_weight=0,
|
655 |
repeat_prompt_for_each_encoder=True,
|
656 |
device=None, verbose=False):
|
@@ -678,14 +680,25 @@ class AdaFaceWrapper(nn.Module):
|
|
678 |
prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
|
679 |
self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
|
680 |
|
681 |
-
if ablate_prompt_embed_type
|
682 |
alt_prompt_embed_type = ablate_prompt_embed_type
|
683 |
-
|
|
684 |
elif nonmix_prompt_emb_weight > 0:
|
685 |
alt_prompt_embed_type = 'ada-nonmix'
|
686 |
-
|
|
|
|
|
687 |
else:
|
688 |
-
|
|
|
689 |
|
690 |
if sum(alt_prompt_emb_weights) > 0:
|
691 |
prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
|
|
|
30 |
use_840k_vae=False, use_ds_text_encoder=False,
|
31 |
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
|
32 |
enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
|
33 |
+
attn_lora_layer_names=['q', 'k', 'v', 'out'], normalize_cross_attn=False, q_lora_updates_query=False,
|
34 |
device='cuda', is_training=False):
|
35 |
'''
|
36 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
|
|
52 |
self.q_lora_updates_query = q_lora_updates_query
|
53 |
self.use_lcm = use_lcm
|
54 |
self.subject_string = subject_string
|
55 |
+
self.normalize_cross_attn = normalize_cross_attn
|
56 |
|
57 |
self.default_scheduler_name = default_scheduler_name
|
58 |
self.num_inference_steps = num_inference_steps if not use_lcm else 4
|
|
|
189 |
pipeline.unet = unet_ensemble
|
190 |
|
191 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
192 |
+
if not remove_unet and (self.unet_uses_attn_lora or self.normalize_cross_attn):
|
193 |
unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
|
194 |
attn_lora_layer_names=self.attn_lora_layer_names,
|
195 |
+
normalize_cross_attn=self.normalize_cross_attn,
|
196 |
q_lora_updates_query=self.q_lora_updates_query)
|
197 |
|
198 |
pipeline.unet = unet2
|
|
|
294 |
def load_unet_loras(self, unet, unet_lora_modules_state_dict,
|
295 |
use_attn_lora=True, use_ffn_lora=False,
|
296 |
attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
297 |
+
normalize_cross_attn=False,
|
298 |
q_lora_updates_query=False):
|
299 |
attn_capture_procs, attn_opt_modules = \
|
300 |
set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
|
301 |
lora_rank=192, lora_scale_down=8,
|
|
|
302 |
q_lora_updates_query=q_lora_updates_query)
|
303 |
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
|
304 |
if use_ffn_lora:
|
|
|
342 |
print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
|
343 |
self.outfeat_capture_blocks.append(unet.up_blocks[3])
|
344 |
|
345 |
+
# If normalize_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
|
346 |
# but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
|
347 |
set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
|
348 |
use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
|
349 |
+
normalize_cross_attn=normalize_cross_attn, mix_attn_mats_in_batch=False,
|
350 |
+
res_hidden_states_gradscale=0)
|
351 |
|
352 |
return unet
|
353 |
|
354 |
def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
355 |
+
normalize_cross_attn=False, q_lora_updates_query=False):
|
356 |
unet_lora_weight_found = False
|
357 |
if isinstance(self.adaface_ckpt_paths, str):
|
358 |
adaface_ckpt_paths = [self.adaface_ckpt_paths]
|
|
|
360 |
adaface_ckpt_paths = self.adaface_ckpt_paths
|
361 |
|
362 |
for adaface_ckpt_path in adaface_ckpt_paths:
|
363 |
+
ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu', weights_only=False)
|
364 |
if 'unet_lora_modules' in ckpt_dict:
|
365 |
unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
|
366 |
print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
|
|
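The `torch.load` call in the hunk above now passes `weights_only=False` explicitly. Newer PyTorch (2.6+) defaults to `weights_only=True`, which refuses to unpickle non-tensor Python objects, so checkpoints that bundle plain dicts or config objects need the explicit flag. A minimal, hedged sketch (the helper name is illustrative, not the repo's API):

```python
# Sketch: load a mixed checkpoint under the post-2.6 torch.load defaults.
# Only do this for checkpoints from a trusted source; weights_only=False
# executes pickled code embedded in the file.
import torch

def load_ckpt(path: str) -> dict:
    return torch.load(path, map_location="cpu", weights_only=False)
```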
|
379 |
unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
|
380 |
use_attn_lora=use_attn_lora,
|
381 |
attn_lora_layer_names=attn_lora_layer_names,
|
382 |
+
normalize_cross_attn=normalize_cross_attn,
|
383 |
q_lora_updates_query=q_lora_updates_query)
|
384 |
unet.unets[i] = unet_
|
385 |
print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
|
|
|
387 |
unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
|
388 |
use_attn_lora=use_attn_lora,
|
389 |
attn_lora_layer_names=attn_lora_layer_names,
|
390 |
+
normalize_cross_attn=normalize_cross_attn,
|
391 |
q_lora_updates_query=q_lora_updates_query)
|
392 |
|
393 |
return unet
|
|
|
612 |
# Scan prompt and replace tokens in self.placeholder_token_ids
|
613 |
# with the corresponding image embeddings.
|
614 |
prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
|
615 |
+
# prompt_embeds are the ada embeddings.
|
616 |
prompt_embeds2 = prompt_embeds.clone()
|
617 |
+
if alt_prompt_embed_type.startswith('img'):
|
618 |
if self.img_prompt_embs is None:
|
619 |
print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
|
620 |
return prompt_embeds
|
|
|
629 |
breakpoint()
|
630 |
|
631 |
repl_tokens = {}
|
632 |
+
ada_emb_weight = alt_prompt_emb_weights[0]
|
633 |
for i in range(len(prompt_tokens)):
|
634 |
if prompt_tokens[i] in self.all_placeholder_tokens:
|
635 |
encoder_idx = next((i for i, sublist in enumerate(self.encoder_placeholder_tokens) \
|
636 |
if prompt_tokens[i] in sublist), 0)
|
637 |
+
alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx + 1]
|
638 |
+
prompt_embeds2[:, i] = prompt_embeds2[:, i] * ada_emb_weight \
|
639 |
+ repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
|
640 |
repl_tokens[prompt_tokens[i]] = 1
|
641 |
|
642 |
repl_token_count = len(repl_tokens)
|
643 |
+
if ada_emb_weight == 0:
|
644 |
print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
|
645 |
else:
|
646 |
print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
|
|
|
652 |
placeholder_tokens_pos='append',
|
653 |
ablate_prompt_only_placeholders=False,
|
654 |
ablate_prompt_no_placeholders=False,
|
655 |
+
ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img', 'img1', 'img2'.
|
656 |
nonmix_prompt_emb_weight=0,
|
657 |
repeat_prompt_for_each_encoder=True,
|
658 |
device=None, verbose=False):
|
|
|
680 |
prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
|
681 |
self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
|
682 |
|
683 |
+
if ablate_prompt_embed_type.startswith('img'):
|
684 |
alt_prompt_embed_type = ablate_prompt_embed_type
|
685 |
+
if alt_prompt_embed_type == 'img1':
|
686 |
+
# The mixing weights of ada, img1, and img2 are 0, 1, and 0.
|
687 |
+
alt_prompt_emb_weights = (0, 1, 0)
|
688 |
+
elif alt_prompt_embed_type == 'img2':
|
689 |
+
# The mixing weights of ada, img1, and img2 are 0, 0, and 1.
|
690 |
+
alt_prompt_emb_weights = (0, 0, 1)
|
691 |
+
else:
|
692 |
+
# The mixing weights of ada, img1, and img2 are 0, 1, and 1.
|
693 |
+
alt_prompt_emb_weights = (0, 1, 1)
|
694 |
elif nonmix_prompt_emb_weight > 0:
|
695 |
alt_prompt_embed_type = 'ada-nonmix'
|
696 |
+
# The mixing weight of ada is 1 - nonmix_prompt_emb_weight, instead of 1 - nonmix_prompt_emb_weight * 2.
|
697 |
+
# It means ada is mixed by this weight with both img1 and img2.
|
698 |
+
alt_prompt_emb_weights = (1 - nonmix_prompt_emb_weight, nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
|
699 |
else:
|
700 |
+
# Don't change the prompt embeddings. So we set all the mixing weights to 0.
|
701 |
+
alt_prompt_emb_weights = (0, 0, 0)
|
702 |
|
703 |
if sum(alt_prompt_emb_weights) > 0:
|
704 |
prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
|
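The hunks above mix the ada prompt embeddings with image-prompt embeddings token by token, driven by a weight tuple over (ada, img1, img2) such as (0, 1, 0), (0, 0, 1), (0, 1, 1), (1 - w, w, w), or (0, 0, 0) for "leave unchanged". A self-contained sketch of the per-token blend, with illustrative names rather than the repo's API:

```python
# Sketch of the blending rule applied at each placeholder token position:
# the ada embedding is scaled by the ada weight and added to the replacement
# (image-prompt) embedding scaled by its per-encoder weight.
import torch

def blend_placeholder_embedding(ada_emb: torch.Tensor,
                                repl_emb: torch.Tensor,
                                ada_weight: float,
                                repl_weight: float) -> torch.Tensor:
    # e.g. weights (0, 1, 0) keep only the first image-prompt embedding,
    # while (1 - w, w, w) mixes ada with both image-prompt embeddings.
    return ada_emb * ada_weight + repl_emb * repl_weight

mixed = blend_placeholder_embedding(torch.randn(1, 768), torch.randn(1, 768),
                                    ada_weight=0.8, repl_weight=0.2)
```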
adaface/diffusers_attn_lora_capture.py
CHANGED
@@ -4,7 +4,6 @@ import torch.nn.functional as F
|
|
4 |
from typing import Optional, Tuple, Dict, Any
|
5 |
from diffusers.models.attention_processor import Attention, AttnProcessor2_0
|
6 |
from diffusers.utils import logging, is_torch_version, deprecate
|
7 |
-
from diffusers.utils.torch_utils import fourier_filter
|
8 |
# UNet is a diffusers PeftAdapterMixin instance.
|
9 |
from diffusers.loaders.peft import PeftAdapterMixin
|
10 |
from peft import LoraConfig, get_peft_model
|
@@ -12,7 +11,6 @@ import peft.tuners.lora as peft_lora
|
|
12 |
from peft.tuners.lora.dora import DoraLinearLayer
|
13 |
from einops import rearrange
|
14 |
import math, re
|
15 |
-
import numpy as np
|
16 |
from peft.tuners.tuners_utils import BaseTunerLayer
|
17 |
|
18 |
|
@@ -28,7 +26,7 @@ class ScaleGrad(torch.autograd.Function):
|
|
28 |
ctx.save_for_backward(alpha_, debug)
|
29 |
output = input_
|
30 |
if debug:
|
31 |
-
print(f"input: {input_.abs().mean().item()}")
|
32 |
return output
|
33 |
|
34 |
@staticmethod
|
@@ -38,7 +36,7 @@ class ScaleGrad(torch.autograd.Function):
|
|
38 |
if ctx.needs_input_grad[0]:
|
39 |
grad_output2 = grad_output * alpha_
|
40 |
if debug:
|
41 |
-
print(f"grad_output2: {grad_output2.abs().mean().item()}")
|
42 |
else:
|
43 |
grad_output2 = None
|
44 |
return grad_output2, None, None
|
@@ -77,36 +75,11 @@ def split_indices_by_instance(indices, as_dict=False):
|
|
77 |
indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
|
78 |
return indices_by_instance
|
79 |
|
80 |
-
# If do_sum, returned emb_attns is 3D. Otherwise 4D.
|
81 |
-
# indices are applied on the first 2 dims of attn_mat.
|
82 |
-
def sel_emb_attns_by_indices(attn_mat, indices, all_token_weights=None, do_sum=True, do_mean=False):
|
83 |
-
indices_by_instance = split_indices_by_instance(indices)
|
84 |
-
|
85 |
-
# emb_attns[0]: [1, 9, 8, 64]
|
86 |
-
# 8: 8 attention heads. Last dim 64: number of image tokens.
|
87 |
-
emb_attns = [ attn_mat[inst_indices].unsqueeze(0) for inst_indices in indices_by_instance ]
|
88 |
-
if all_token_weights is not None:
|
89 |
-
# all_token_weights: [4, 77].
|
90 |
-
# token_weights_by_instance[0]: [1, 9, 1, 1].
|
91 |
-
token_weights = [ all_token_weights[inst_indices].reshape(1, -1, 1, 1) for inst_indices in indices_by_instance ]
|
92 |
-
else:
|
93 |
-
token_weights = [ 1 ] * len(indices_by_instance)
|
94 |
-
|
95 |
-
# Apply token weights.
|
96 |
-
emb_attns = [ emb_attns[i] * token_weights[i] for i in range(len(indices_by_instance)) ]
|
97 |
-
|
98 |
-
# sum among K_subj_i subj embeddings -> [1, 8, 64]
|
99 |
-
if do_sum:
|
100 |
-
emb_attns = [ emb_attns[i].sum(dim=1) for i in range(len(indices_by_instance)) ]
|
101 |
-
elif do_mean:
|
102 |
-
emb_attns = [ emb_attns[i].mean(dim=1) for i in range(len(indices_by_instance)) ]
|
103 |
-
|
104 |
-
emb_attns = torch.cat(emb_attns, dim=0)
|
105 |
-
return emb_attns
|
106 |
-
|
107 |
# Slow implementation equivalent to F.scaled_dot_product_attention.
|
108 |
-
def scaled_dot_product_attention(query, key, value,
|
109 |
-
|
|
|
|
|
110 |
is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
|
111 |
B, L, S = query.size(0), query.size(-2), key.size(-2)
|
112 |
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
|
@@ -128,21 +101,39 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
|
|
128 |
key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
|
129 |
value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
|
130 |
|
131 |
-
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
146 |
attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
|
147 |
output = attn_weight @ value
|
148 |
return output, attn_score, attn_weight
|
@@ -156,23 +147,25 @@ class AttnProcessor_LoRA_Capture(nn.Module):
|
|
156 |
def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
|
157 |
lora_uses_dora=True, lora_proj_layers=None,
|
158 |
lora_rank: int = 192, lora_alpha: float = 16,
|
159 |
-
cross_attn_shrink_factor: float = 0.5,
|
160 |
q_lora_updates_query=False, attn_proc_idx=-1):
|
161 |
super().__init__()
|
162 |
|
163 |
self.global_enable_lora = enable_lora
|
164 |
self.attn_proc_idx = attn_proc_idx
|
165 |
# reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
|
166 |
-
# By default,
|
167 |
-
self.reset_attn_cache_and_flags(capture_ca_activations, False, enable_lora)
|
168 |
self.lora_rank = lora_rank
|
169 |
self.lora_alpha = lora_alpha
|
170 |
self.lora_scale = self.lora_alpha / self.lora_rank
|
171 |
-
self.cross_attn_shrink_factor = cross_attn_shrink_factor
|
172 |
self.q_lora_updates_query = q_lora_updates_query
|
173 |
|
174 |
self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
|
175 |
if self.global_enable_lora:
|
|
|
|
|
|
|
|
|
176 |
for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
|
177 |
if lora_layer_name == 'q':
|
178 |
self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
|
@@ -188,9 +181,10 @@ class AttnProcessor_LoRA_Capture(nn.Module):
|
|
188 |
use_dora=lora_uses_dora, lora_dropout=0.1)
|
189 |
|
190 |
# LoRA layers can be enabled/disabled dynamically.
|
191 |
-
def reset_attn_cache_and_flags(self, capture_ca_activations,
|
192 |
self.capture_ca_activations = capture_ca_activations
|
193 |
-
self.
|
|
|
194 |
self.cached_activations = {}
|
195 |
# Only enable LoRA for the next call(s) if global_enable_lora is set to True.
|
196 |
self.enable_lora = enable_lora and self.global_enable_lora
|
@@ -312,11 +306,14 @@ class AttnProcessor_LoRA_Capture(nn.Module):
|
|
312 |
breakpoint()
|
313 |
|
314 |
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
315 |
-
if is_cross_attn and (self.capture_ca_activations or self.
|
316 |
hidden_states, attn_score, attn_prob = \
|
317 |
scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
|
318 |
-
dropout_p=0.0,
|
319 |
-
|
|
|
|
|
|
|
320 |
else:
|
321 |
# Use the faster implementation of scaled_dot_product_attention
|
322 |
# when not capturing the activations or suppressing the subject attention.
|
@@ -452,7 +449,7 @@ def CrossAttnUpBlock2D_forward_capture(
|
|
452 |
# Adapted from ConsistentIDPipeline:set_ip_adapter().
|
453 |
# attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
|
454 |
def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
455 |
-
lora_rank=192, lora_scale_down=8,
|
456 |
q_lora_updates_query=False):
|
457 |
attn_procs = {}
|
458 |
attn_capture_procs = {}
|
@@ -502,7 +499,6 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
|
|
502 |
lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
|
503 |
# LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
|
504 |
lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
|
505 |
-
cross_attn_shrink_factor=cross_attn_shrink_factor,
|
506 |
q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
|
507 |
|
508 |
attn_proc_idx += 1
|
@@ -513,6 +509,11 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
|
|
513 |
attn_capture_procs[name] = attn_capture_proc
|
514 |
|
515 |
if use_attn_lora:
|
|
|
|
|
|
|
|
|
|
|
516 |
for subname, module in attn_capture_proc.named_modules():
|
517 |
if isinstance(module, peft_lora.LoraLayer):
|
518 |
# ModuleDict doesn't allow "." in the key.
|
@@ -537,7 +538,7 @@ def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k',
|
|
537 |
return attn_capture_procs, attn_opt_modules
|
538 |
|
539 |
# NOTE: cross-attn layers are included in the returned lora_modules.
|
540 |
-
def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=
|
541 |
# target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
|
542 |
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
|
543 |
# Cannot set to conv.+ as it will match added adapter module names, including
|
@@ -592,15 +593,18 @@ def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=1
|
|
592 |
def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
|
593 |
outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
|
594 |
use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
|
595 |
-
|
596 |
# For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
|
597 |
-
for attn_capture_proc in attn_capture_procs:
|
598 |
-
attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations,
|
|
|
599 |
# outfeat_capture_blocks only contains the last up block, up_blocks[3].
|
600 |
# It contains 3 FFN layers. We want to capture their output features.
|
601 |
for block in outfeat_capture_blocks:
|
602 |
block.capture_outfeats = capture_ca_activations
|
603 |
|
|
|
|
|
604 |
for block in res_hidden_states_gradscale_blocks:
|
605 |
block.res_hidden_states_gradscale = res_hidden_states_gradscale
|
606 |
|
@@ -639,6 +643,7 @@ def get_captured_activations(capture_ca_activations, attn_capture_procs, outfeat
|
|
639 |
block.cached_outfeats = {}
|
640 |
block.capture_outfeats = False
|
641 |
|
|
|
642 |
for layer_idx in captured_layer_indices:
|
643 |
# Subtract 22 from ca_layer_idx to match the layer index in up_blocks[3].cached_outfeats.
|
644 |
# 23, 24 -> 1, 2 (!! not 0, 1 !!)
|
|
|
4 |
from typing import Optional, Tuple, Dict, Any
|
5 |
from diffusers.models.attention_processor import Attention, AttnProcessor2_0
|
6 |
from diffusers.utils import logging, is_torch_version, deprecate
|
|
|
7 |
# UNet is a diffusers PeftAdapterMixin instance.
|
8 |
from diffusers.loaders.peft import PeftAdapterMixin
|
9 |
from peft import LoraConfig, get_peft_model
|
|
|
11 |
from peft.tuners.lora.dora import DoraLinearLayer
|
12 |
from einops import rearrange
|
13 |
import math, re
|
|
|
14 |
from peft.tuners.tuners_utils import BaseTunerLayer
|
15 |
|
16 |
|
|
|
26 |
ctx.save_for_backward(alpha_, debug)
|
27 |
output = input_
|
28 |
if debug:
|
29 |
+
print(f"input: {input_.abs().mean().detach().item()}")
|
30 |
return output
|
31 |
|
32 |
@staticmethod
|
|
|
36 |
if ctx.needs_input_grad[0]:
|
37 |
grad_output2 = grad_output * alpha_
|
38 |
if debug:
|
39 |
+
print(f"grad_output2: {grad_output2.abs().mean().detach().item()}")
|
40 |
else:
|
41 |
grad_output2 = None
|
42 |
return grad_output2, None, None
|
|
|
75 |
indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
|
76 |
return indices_by_instance
|
77 |
|
|
78 |
# Slow implementation equivalent to F.scaled_dot_product_attention.
|
79 |
+
def scaled_dot_product_attention(query, key, value, cross_attn_scale_factor,
|
80 |
+
attn_mask=None, dropout_p=0.0,
|
81 |
+
subj_indices=None, normalize_cross_attn=False,
|
82 |
+
mix_attn_mats_in_batch=False,
|
83 |
is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
|
84 |
B, L, S = query.size(0), query.size(-2), key.size(-2)
|
85 |
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
|
|
|
101 |
key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
|
102 |
value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
|
103 |
|
104 |
+
attn_score = query @ key.transpose(-2, -1) * scale_factor
|
105 |
|
106 |
+
# attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_score.
|
107 |
+
attn_score += attn_bias
|
108 |
+
if mix_attn_mats_in_batch:
|
109 |
+
# The instances in the batch are [sc, mc]. We average their attn scores,
|
110 |
+
# and apply to both instances.
|
111 |
+
# attn_score: [2, 8, 4096, 77] -> [1, 8, 4096, 77] -> [2, 8, 4096, 77].
|
112 |
+
# If BLOCK_SIZE > 1, attn_score.shape[0] = 2 * BLOCK_SIZE.
|
113 |
+
if attn_score.shape[0] %2 != 0:
|
114 |
+
breakpoint()
|
115 |
+
attn_score_sc, attn_score_mc = attn_score.chunk(2, dim=0)
|
116 |
+
# Cut off the grad flow from the SC instance to the MC instance.
|
117 |
+
attn_score = (attn_score_sc + attn_score_mc.detach()) / 2
|
118 |
+
attn_score = attn_score.repeat(2, 1, 1, 1)
|
119 |
+
elif normalize_cross_attn:
|
120 |
+
if subj_indices is None:
|
121 |
+
breakpoint()
|
122 |
+
subj_indices_B, subj_indices_N = subj_indices
|
123 |
+
subj_attn_score = attn_score[subj_indices_B, :, :, subj_indices_N]
|
124 |
+
# Normalize the attention score of the subject tokens to have mean 0 across tokens,
|
125 |
+
# so that positive and negative scores are balanced.
|
126 |
+
subj_attn_score = subj_attn_score - subj_attn_score.mean(dim=2, keepdim=True).detach()
|
127 |
+
# cross_attn_scale is a learnable parameter, so the score will be scaled appropriately.
|
128 |
+
# Scale up the BP'ed gradient to cross_attn_scale_factor by 10x.
|
129 |
+
ca_scale_grad_scaler = gen_gradient_scaler(10)
|
130 |
+
subj_attn_score = subj_attn_score * ca_scale_grad_scaler(cross_attn_scale_factor)
|
131 |
+
attn_score2 = attn_score.clone()
|
132 |
+
attn_score2[subj_indices_B, :, :, subj_indices_N] = subj_attn_score
|
133 |
+
attn_score = attn_score2
|
134 |
+
# Otherwise, do nothing to attn_score.
|
135 |
+
|
136 |
+
attn_weight = torch.softmax(attn_score, dim=-1)
|
137 |
attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
|
138 |
output = attn_weight @ value
|
139 |
return output, attn_score, attn_weight
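When `normalize_cross_attn` is enabled, the slow attention path above zero-centers the subject tokens' attention scores over the image-query dimension and rescales them with the learnable `cross_attn_scale_factor` (the 10x gradient boost on that factor is omitted here). A hedged, standalone sketch of that step, with illustrative tensor names:

```python
# Sketch: normalize only the subject tokens' cross-attention scores.
import torch

def normalize_subject_scores(attn_score: torch.Tensor,
                             subj_indices: tuple,
                             cross_attn_scale: torch.Tensor) -> torch.Tensor:
    # attn_score: [B, heads, n_queries, n_text_tokens];
    # subj_indices: (batch indices, token indices), two equal-length LongTensors.
    subj_b, subj_n = subj_indices
    subj_score = attn_score[subj_b, :, :, subj_n]          # [K, heads, n_queries]
    # Zero-center across the image-query dimension so positive and negative
    # scores balance, then rescale by the learnable factor.
    subj_score = subj_score - subj_score.mean(dim=2, keepdim=True).detach()
    out = attn_score.clone()
    out[subj_b, :, :, subj_n] = subj_score * cross_attn_scale
    return out

scores = torch.randn(2, 8, 64, 77)
idx = (torch.tensor([0, 1]), torch.tensor([5, 5]))
normalized = normalize_subject_scores(scores, idx, torch.tensor(0.8))
```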
|
|
|
147 |
def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
|
148 |
lora_uses_dora=True, lora_proj_layers=None,
|
149 |
lora_rank: int = 192, lora_alpha: float = 16,
|
|
|
150 |
q_lora_updates_query=False, attn_proc_idx=-1):
|
151 |
super().__init__()
|
152 |
|
153 |
self.global_enable_lora = enable_lora
|
154 |
self.attn_proc_idx = attn_proc_idx
|
155 |
# reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
|
156 |
+
# By default, normalize_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
|
157 |
+
self.reset_attn_cache_and_flags(capture_ca_activations, False, False, enable_lora)
|
158 |
self.lora_rank = lora_rank
|
159 |
self.lora_alpha = lora_alpha
|
160 |
self.lora_scale = self.lora_alpha / self.lora_rank
|
|
|
161 |
self.q_lora_updates_query = q_lora_updates_query
|
162 |
|
163 |
self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
|
164 |
if self.global_enable_lora:
|
165 |
+
# enable_lora = True iff this is a cross-attn layer in the last 3 up blocks.
|
166 |
+
# Since we only use cross_attn_scale_factor on cross-attn layers,
|
167 |
+
# we only use cross_attn_scale_factor when enable_lora is True.
|
168 |
+
self.cross_attn_scale_factor = nn.Parameter(torch.tensor(0.8), requires_grad=True)
|
169 |
for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
|
170 |
if lora_layer_name == 'q':
|
171 |
self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
|
|
|
181 |
use_dora=lora_uses_dora, lora_dropout=0.1)
|
182 |
|
183 |
# LoRA layers can be enabled/disabled dynamically.
|
184 |
+
def reset_attn_cache_and_flags(self, capture_ca_activations, normalize_cross_attn, mix_attn_mats_in_batch, enable_lora):
|
185 |
self.capture_ca_activations = capture_ca_activations
|
186 |
+
self.normalize_cross_attn = normalize_cross_attn
|
187 |
+
self.mix_attn_mats_in_batch = mix_attn_mats_in_batch
|
188 |
self.cached_activations = {}
|
189 |
# Only enable LoRA for the next call(s) if global_enable_lora is set to True.
|
190 |
self.enable_lora = enable_lora and self.global_enable_lora
|
|
|
306 |
breakpoint()
|
307 |
|
308 |
# the output of sdp = (batch, num_heads, seq_len, head_dim)
|
309 |
+
if is_cross_attn and (self.capture_ca_activations or self.normalize_cross_attn):
|
310 |
hidden_states, attn_score, attn_prob = \
|
311 |
scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
|
312 |
+
dropout_p=0.0, subj_indices=subj_indices,
|
313 |
+
normalize_cross_attn=self.normalize_cross_attn,
|
314 |
+
cross_attn_scale_factor=self.cross_attn_scale_factor,
|
315 |
+
mix_attn_mats_in_batch=self.mix_attn_mats_in_batch)
|
316 |
+
|
317 |
else:
|
318 |
# Use the faster implementation of scaled_dot_product_attention
|
319 |
# when not capturing the activations or suppressing the subject attention.
|
|
|
449 |
# Adapted from ConsistentIDPipeline:set_ip_adapter().
|
450 |
# attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
|
451 |
def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
452 |
+
lora_rank=192, lora_scale_down=8,
|
453 |
q_lora_updates_query=False):
|
454 |
attn_procs = {}
|
455 |
attn_capture_procs = {}
|
|
|
499 |
lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
|
500 |
# LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
|
501 |
lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
|
|
|
502 |
q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)
|
503 |
|
504 |
attn_proc_idx += 1
|
|
|
509 |
attn_capture_procs[name] = attn_capture_proc
|
510 |
|
511 |
if use_attn_lora:
|
512 |
+
cross_attn_scale_factor_name = name + "_cross_attn_scale_factor"
|
513 |
+
# Put cross_attn_scale_factor in attn_opt_modules, so that we can optimize and save/load it.
|
514 |
+
attn_opt_modules[cross_attn_scale_factor_name] = attn_capture_proc.cross_attn_scale_factor
|
515 |
+
|
516 |
+
# Put LoRA layers in attn_opt_modules, so that we can optimize and save/load them.
|
517 |
for subname, module in attn_capture_proc.named_modules():
|
518 |
if isinstance(module, peft_lora.LoraLayer):
|
519 |
# ModuleDict doesn't allow "." in the key.
|
|
|
538 |
return attn_capture_procs, attn_opt_modules
|
539 |
|
540 |
# NOTE: cross-attn layers are included in the returned lora_modules.
|
541 |
+
def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=True, lora_rank=192, lora_alpha=16):
|
542 |
# target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
|
543 |
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
|
544 |
# Cannot set to conv.+ as it will match added adapter module names, including
|
|
|
593 |
def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
|
594 |
outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
|
595 |
use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
|
596 |
+
normalize_cross_attn, mix_attn_mats_in_batch, res_hidden_states_gradscale):
|
597 |
# For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
|
598 |
+
for i, attn_capture_proc in enumerate(attn_capture_procs):
|
599 |
+
attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, normalize_cross_attn, mix_attn_mats_in_batch,
|
600 |
+
enable_lora=use_attn_lora)
|
601 |
# outfeat_capture_blocks only contains the last up block, up_blocks[3].
|
602 |
# It contains 3 FFN layers. We want to capture their output features.
|
603 |
for block in outfeat_capture_blocks:
|
604 |
block.capture_outfeats = capture_ca_activations
|
605 |
|
606 |
+
# res_hidden_states_gradscale_blocks contain the second to the last up blocks, up_blocks[1:].
|
607 |
+
# It's only used to set res_hidden_states_gradscale, and doesn't capture anything.
|
608 |
for block in res_hidden_states_gradscale_blocks:
|
609 |
block.res_hidden_states_gradscale = res_hidden_states_gradscale
|
610 |
|
|
|
643 |
block.cached_outfeats = {}
|
644 |
block.capture_outfeats = False
|
645 |
|
646 |
+
|
647 |
for layer_idx in captured_layer_indices:
|
648 |
# Subtract 22 from ca_layer_idx to match the layer index in up_blocks[3].cached_outfeats.
|
649 |
# 23, 24 -> 1, 2 (!! not 0, 1 !!)
|
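Besides the LoRA layers, the setup code above registers each processor's learnable `cross_attn_scale_factor` in `attn_opt_modules` so it is optimized and saved/loaded together with the LoRA weights. A hedged sketch of that pattern with illustrative names (not the repo's exact API):

```python
# Sketch: gather per-processor learnable scalars into one container so a single
# state_dict and optimizer cover them.
import torch
import torch.nn as nn

class DummyAttnProc(nn.Module):
    def __init__(self):
        super().__init__()
        self.cross_attn_scale_factor = nn.Parameter(torch.tensor(0.8))

procs = {"up_blocks.3.attentions.0.processor": DummyAttnProc(),
         "up_blocks.3.attentions.1.processor": DummyAttnProc()}
# Container keys cannot contain '.', mirroring the ModuleDict caveat in the diff.
attn_opt_params = nn.ParameterDict({
    name.replace(".", "_") + "_cross_attn_scale_factor": proc.cross_attn_scale_factor
    for name, proc in procs.items()
})
optimizer = torch.optim.AdamW(attn_opt_params.parameters(), lr=1e-4)
state = attn_opt_params.state_dict()  # checkpointed alongside the LoRA weights
```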
adaface/face_id_to_ada_prompt.py
CHANGED
@@ -603,9 +603,13 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
603 |
'''
|
604 |
# Use the same model as ID2AdaPrompt does.
|
605 |
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
606 |
-
# Note there
|
|
|
|
|
|
|
|
|
607 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
608 |
-
|
609 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
610 |
print(f'Arc2Face Face encoder loaded on CPU.')
|
611 |
|
@@ -642,7 +646,6 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
642 |
|
643 |
def _apply(self, fn):
|
644 |
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
645 |
-
return
|
646 |
# A dirty hack to get the device of the model, passed from
|
647 |
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
648 |
test_tensor = torch.zeros(1) # Create a test tensor
|
@@ -654,16 +657,14 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
654 |
|
655 |
if str(device) == 'cpu':
|
656 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
657 |
-
|
658 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
659 |
else:
|
660 |
device_id = device.index
|
661 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
662 |
providers=['CUDAExecutionProvider'],
|
663 |
-
provider_options=[{
|
664 |
-
|
665 |
-
"gpu_mem_limit": 2 * 1024**3
|
666 |
-
}])
|
667 |
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
668 |
|
669 |
self.device = device
|
@@ -739,8 +740,8 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
739 |
# but diffusers will call .to(dtype) in .from_single_file(),
|
740 |
# and at that moment, the consistentID specific modules are not loaded yet.
|
741 |
pipe = ConsistentIDPipeline.from_single_file(base_model_path)
|
742 |
-
pipe.load_ConsistentID_model(consistentID_weight_path="
|
743 |
-
bise_net_weight_path="
|
744 |
pipe.to(dtype=self.dtype)
|
745 |
# Since the passed-in pipe is None, this should be called during inference,
|
746 |
# when the teacher ConsistentIDPipeline is not initialized.
|
@@ -791,7 +792,6 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
791 |
|
792 |
def _apply(self, fn):
|
793 |
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
794 |
-
return
|
795 |
# A dirty hack to get the device of the model, passed from
|
796 |
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
797 |
test_tensor = torch.zeros(1) # Create a test tensor
|
@@ -809,10 +809,8 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
809 |
device_id = device.index
|
810 |
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
811 |
providers=['CUDAExecutionProvider'],
|
812 |
-
provider_options=[{
|
813 |
-
|
814 |
-
"gpu_mem_limit": 2 * 1024**3
|
815 |
-
}])
|
816 |
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
817 |
|
818 |
self.device = device
|
@@ -1277,7 +1275,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1277 |
# No faces are found in the images, so return None embeddings.
|
1278 |
# We don't want to return an all-zero embedding, which is useless.
|
1279 |
if num_available_id_vecs == 0:
|
1280 |
-
return None, [0]
|
1281 |
|
1282 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1283 |
# during inference, we average across the batch dim.
|
|
|
603 |
'''
|
604 |
# Use the same model as ID2AdaPrompt does.
|
605 |
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
606 |
+
# Note there's a second "model" in the path.
|
607 |
+
# Note DO use CUDAExecutionProvider during training and CPUExecutionProvider during inference.
|
608 |
+
# Otherwise, CPUExecutionProvider will hang DDP training,
|
609 |
+
# and CUDAExecutionProvider will cause OOM on huggingface spaces.
|
610 |
+
self.onnx_providers = ['CUDAExecutionProvider'] if self.is_training else ['CPUExecutionProvider']
|
611 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
612 |
+
providers=self.onnx_providers)
|
613 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
614 |
print(f'Arc2Face Face encoder loaded on CPU.')
|
615 |
|
|
|
646 |
|
647 |
def _apply(self, fn):
|
648 |
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
|
|
649 |
# A dirty hack to get the device of the model, passed from
|
650 |
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
651 |
test_tensor = torch.zeros(1) # Create a test tensor
|
|
|
657 |
|
658 |
if str(device) == 'cpu':
|
659 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
660 |
+
providers=['CPUExecutionProvider'])
|
661 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
662 |
else:
|
663 |
device_id = device.index
|
664 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
665 |
providers=['CUDAExecutionProvider'],
|
666 |
+
provider_options=[{'device_id': device_id,
|
667 |
+
'cudnn_conv_algo_search': 'HEURISTIC'}])
|
|
|
|
|
668 |
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
669 |
|
670 |
self.device = device
|
|
|
740 |
# but diffusers will call .to(dtype) in .from_single_file(),
|
741 |
# and at that moment, the consistentID specific modules are not loaded yet.
|
742 |
pipe = ConsistentIDPipeline.from_single_file(base_model_path)
|
743 |
+
pipe.load_ConsistentID_model(consistentID_weight_path="models/ConsistentID/ConsistentID-v1.bin",
|
744 |
+
bise_net_weight_path="models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth")
|
745 |
pipe.to(dtype=self.dtype)
|
746 |
# Since the passed-in pipe is None, this should be called during inference,
|
747 |
# when the teacher ConsistentIDPipeline is not initialized.
|
|
|
792 |
|
793 |
def _apply(self, fn):
|
794 |
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
|
|
795 |
# A dirty hack to get the device of the model, passed from
|
796 |
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
797 |
test_tensor = torch.zeros(1) # Create a test tensor
|
|
|
809 |
device_id = device.index
|
810 |
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
811 |
providers=['CUDAExecutionProvider'],
|
812 |
+
provider_options=[{'device_id': device_id,
|
813 |
+
'cudnn_conv_algo_search': 'HEURISTIC'}])
|
|
|
|
|
814 |
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
815 |
|
816 |
self.device = device
|
|
|
1275 |
# No faces are found in the images, so return None embeddings.
|
1276 |
# We don't want to return an all-zero embedding, which is useless.
|
1277 |
if num_available_id_vecs == 0:
|
1278 |
+
return None, None, [0]
|
1279 |
|
1280 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1281 |
# during inference, we average across the batch dim.
|
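The change above selects ONNX execution providers by phase: CUDA while training (CPUExecutionProvider can hang DDP training) and CPU at inference (CUDA sessions can run out of memory on shared Spaces GPUs). A hedged sketch assuming the `insightface` package; the wrapper function itself is illustrative:

```python
# Sketch: build an insightface FaceAnalysis app with phase-dependent providers.
from insightface.app import FaceAnalysis

def build_face_app(is_training: bool, device_id: int = 0) -> FaceAnalysis:
    if is_training:
        # CUDA for training; pass the device and a cuDNN search heuristic.
        app = FaceAnalysis(name="antelopev2", root="models/insightface",
                           providers=["CUDAExecutionProvider"],
                           provider_options=[{"device_id": device_id,
                                              "cudnn_conv_algo_search": "HEURISTIC"}])
        app.prepare(ctx_id=device_id, det_size=(512, 512))
    else:
        # CPU for inference to avoid GPU OOM on shared hardware.
        app = FaceAnalysis(name="antelopev2", root="models/insightface",
                           providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_size=(512, 512))
    return app
```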
adaface/unet_teachers.py
CHANGED
@@ -62,46 +62,41 @@ class UNetTeacher(nn.Module):
|
|
62 |
# t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
|
63 |
# same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
|
64 |
def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
|
65 |
-
num_denoising_steps=1, same_t_noise_across_instances=False,
|
66 |
global_t_lb=0, global_t_ub=1000):
|
67 |
assert num_denoising_steps <= 10
|
68 |
|
69 |
-
|
|
|
|
|
|
|
70 |
self.uses_cfg = np.random.rand() < self.p_uses_cfg
|
71 |
-
if self.uses_cfg:
|
72 |
-
# Randomly sample a cfg_scale from cfg_scale_range.
|
73 |
-
self.cfg_scale = np.random.uniform(*self.cfg_scale_range)
|
74 |
-
if self.cfg_scale == 1:
|
75 |
-
self.uses_cfg = False
|
76 |
-
|
77 |
-
if self.uses_cfg:
|
78 |
-
print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
|
79 |
-
if negative_context is not None:
|
80 |
-
negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
|
81 |
-
|
82 |
-
# if negative_context is None, then teacher_context is a combination of
|
83 |
-
# (one or multiple if unet_ensemble) pos_context and neg_context.
|
84 |
-
# If negative_context is not None, then teacher_context is only pos_context.
|
85 |
-
else:
|
86 |
-
self.cfg_scale = 1
|
87 |
-
print("Teacher does not use CFG.")
|
88 |
-
|
89 |
-
# If negative_context is None, then teacher_context is a combination of
|
90 |
-
# (one or multiple if unet_ensemble) pos_context and neg_context.
|
91 |
-
# Since not uses_cfg, we only need pos_context.
|
92 |
-
# If negative_context is not None, then teacher_context is only pos_context.
|
93 |
-
if negative_context is None:
|
94 |
-
teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
|
95 |
else:
|
96 |
# p_uses_cfg = 0. Never use CFG.
|
97 |
self.uses_cfg = False
|
98 |
-
# In this case, the student only passes pos_context to the teacher,
|
99 |
-
# so no need to split teacher_context into pos_context and neg_context.
|
100 |
-
# self.cfg_scale will be accessed by the student,
|
101 |
-
# so we need to make sure it is always set correctly,
|
102 |
-
# in case someday we want to switch from CFG to non-CFG during runtime.
|
103 |
self.cfg_scale = 1
|
104 |
|
|
|
|
|
|
|
|
105 |
is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
|
106 |
if self.name == 'unet_ensemble':
|
107 |
# teacher_context is a list of teacher contexts.
|
@@ -199,14 +194,20 @@ class UNetTeacher(nn.Module):
|
|
199 |
teacher_pos_contexts = []
|
200 |
# teacher_context is a list of teacher contexts.
|
201 |
for teacher_context_i in teacher_context:
|
202 |
-
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
205 |
teacher_pos_contexts.append(pos_context)
|
206 |
teacher_context = teacher_pos_contexts
|
207 |
else:
|
208 |
-
|
209 |
-
|
|
|
|
|
|
|
210 |
breakpoint()
|
211 |
teacher_context = pos_context
|
212 |
|
|
|
62 |
# t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
|
63 |
# same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
|
64 |
def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
|
65 |
+
num_denoising_steps=1, force_uses_cfg=False, same_t_noise_across_instances=False,
|
66 |
global_t_lb=0, global_t_ub=1000):
|
67 |
assert num_denoising_steps <= 10
|
68 |
|
69 |
+
# force_uses_cfg overrides p_uses_cfg.
|
70 |
+
if force_uses_cfg > 0:
|
71 |
+
self.uses_cfg = True
|
72 |
+
elif self.p_uses_cfg > 0:
|
73 |
self.uses_cfg = np.random.rand() < self.p_uses_cfg
|
|
|
|
74 |
else:
|
75 |
# p_uses_cfg = 0. Never use CFG.
|
76 |
self.uses_cfg = False
|
|
|
|
|
|
|
|
|
|
|
77 |
self.cfg_scale = 1
|
78 |
|
79 |
+
if self.uses_cfg:
|
80 |
+
# Randomly sample a cfg_scale from cfg_scale_range.
|
81 |
+
self.cfg_scale = np.random.uniform(*self.cfg_scale_range)
|
82 |
+
print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
|
83 |
+
if negative_context is not None:
|
84 |
+
negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
|
85 |
+
|
86 |
+
# if negative_context is None, then teacher_context is a combination of
|
87 |
+
# (one or multiple if unet_ensemble) pos_context and neg_context.
|
88 |
+
# If negative_context is not None, then teacher_context is only pos_context.
|
89 |
+
else:
|
90 |
+
self.cfg_scale = 1
|
91 |
+
print("Teacher does not use CFG.")
|
92 |
+
|
93 |
+
# If negative_context is None, then teacher_context is either a combination of
|
94 |
+
# (one or multiple if unet_ensemble) pos_context and neg_context, or only pos_context.
|
95 |
+
# Since not uses_cfg, we only need pos_context.
|
96 |
+
# If negative_context is not None, then teacher_context is only pos_context.
|
97 |
+
if negative_context is None:
|
98 |
+
teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
|
99 |
+
|
100 |
is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
|
101 |
if self.name == 'unet_ensemble':
|
102 |
# teacher_context is a list of teacher contexts.
|
|
|
194 |
teacher_pos_contexts = []
|
195 |
# teacher_context is a list of teacher contexts.
|
196 |
for teacher_context_i in teacher_context:
|
197 |
+
if teacher_context_i.shape[0] == BS * 2:
|
198 |
+
pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
|
199 |
+
elif teacher_context_i.shape[0] == BS:
|
200 |
+
pos_context = teacher_context_i
|
201 |
+
else:
|
202 |
+
breakpoint()
|
203 |
teacher_pos_contexts.append(pos_context)
|
204 |
teacher_context = teacher_pos_contexts
|
205 |
else:
|
206 |
+
if teacher_context.shape[0] == BS * 2:
|
207 |
+
pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
|
208 |
+
elif teacher_context.shape[0] == BS:
|
209 |
+
pos_context = teacher_context
|
210 |
+
else:
|
211 |
breakpoint()
|
212 |
teacher_context = pos_context
|
213 |
|
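The reworked block above lets `force_uses_cfg` override the stochastic `p_uses_cfg` decision and only samples a CFG scale when CFG is actually enabled. A hedged, standalone sketch of the decision logic (the function name is illustrative; the real method also handles `negative_context` and UNet ensembles):

```python
# Sketch: decide whether the teacher uses classifier-free guidance and at what scale.
import numpy as np

def decide_cfg(p_uses_cfg: float, cfg_scale_range=(1.3, 2.0), force_uses_cfg=False):
    if force_uses_cfg:
        uses_cfg = True
    elif p_uses_cfg > 0:
        uses_cfg = np.random.rand() < p_uses_cfg
    else:
        uses_cfg = False
    # cfg_scale == 1 means "no CFG"; only sample a scale when CFG is on.
    cfg_scale = np.random.uniform(*cfg_scale_range) if uses_cfg else 1.0
    return uses_cfg, cfg_scale

uses_cfg, cfg_scale = decide_cfg(p_uses_cfg=0.5, force_uses_cfg=True)
```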
adaface/util.py
CHANGED
@@ -48,7 +48,7 @@ def perturb_tensor(ts, perturb_std, perturb_std_is_relative=True, keep_norm=Fals
|
|
48 |
ts = ts + noise
|
49 |
|
50 |
if verbose:
|
51 |
-
print(f"Correlations between new and original tensors: {F.cosine_similarity(ts.flatten(), orig_ts.flatten(), dim=0).item():.03f}")
|
52 |
|
53 |
return ts
|
54 |
|
@@ -69,7 +69,7 @@ def calc_stats(emb_name, embeddings, mean_dim=-1):
|
|
69 |
# Compute it manually.
|
70 |
l2_loss = ((embeddings - emb_mean) ** 2).mean().sqrt()
|
71 |
norms = torch.norm(embeddings, dim=1).detach().cpu().numpy()
|
72 |
-
print("L1: %.4f, L2: %.4f" %(l1_loss.item(), l2_loss.item()))
|
73 |
print("Norms: min: %.4f, max: %.4f, mean: %.4f, std: %.4f" %(norms.min(), norms.max(), norms.mean(), norms.std()))
|
74 |
|
75 |
|
@@ -80,7 +80,7 @@ class ScaleGrad(torch.autograd.Function):
|
|
80 |
ctx.save_for_backward(alpha_, debug)
|
81 |
output = input_
|
82 |
if debug:
|
83 |
-
print(f"input: {input_.abs().mean().item()}")
|
84 |
return output
|
85 |
|
86 |
@staticmethod
|
@@ -90,7 +90,7 @@ class ScaleGrad(torch.autograd.Function):
|
|
90 |
if ctx.needs_input_grad[0]:
|
91 |
grad_output2 = grad_output * alpha_
|
92 |
if debug:
|
93 |
-
print(f"grad_output2: {grad_output2.abs().mean().item()}")
|
94 |
else:
|
95 |
grad_output2 = None
|
96 |
return grad_output2, None, None
|
@@ -232,8 +232,8 @@ def create_consistentid_pipeline(base_model_path="models/sd15-dste8-vae.safetens
|
|
232 |
# consistentID specific modules are still in fp32. Will be converted to fp16
|
233 |
# later with .to(device, torch_dtype) by the caller.
|
234 |
pipe.load_ConsistentID_model(
|
235 |
-
consistentID_weight_path="
|
236 |
-
bise_net_weight_path="
|
237 |
)
|
238 |
# Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
|
239 |
# because we've overloaded .to() to convert consistentID specific modules as well,
|
|
|
48 |
ts = ts + noise
|
49 |
|
50 |
if verbose:
|
51 |
+
print(f"Correlations between new and original tensors: {F.cosine_similarity(ts.flatten(), orig_ts.flatten(), dim=0).detach().item():.03f}")
|
52 |
|
53 |
return ts
|
54 |
|
|
|
69 |
# Compute it manually.
|
70 |
l2_loss = ((embeddings - emb_mean) ** 2).mean().sqrt()
|
71 |
norms = torch.norm(embeddings, dim=1).detach().cpu().numpy()
|
72 |
+
print("L1: %.4f, L2: %.4f" %(l1_loss.detach().item(), l2_loss.detach().item()))
|
73 |
print("Norms: min: %.4f, max: %.4f, mean: %.4f, std: %.4f" %(norms.min(), norms.max(), norms.mean(), norms.std()))
|
74 |
|
75 |
|
|
|
80 |
ctx.save_for_backward(alpha_, debug)
|
81 |
output = input_
|
82 |
if debug:
|
83 |
+
print(f"input: {input_.abs().mean().detach().item()}")
|
84 |
return output
|
85 |
|
86 |
@staticmethod
|
|
|
90 |
if ctx.needs_input_grad[0]:
|
91 |
grad_output2 = grad_output * alpha_
|
92 |
if debug:
|
93 |
+
print(f"grad_output2: {grad_output2.abs().mean().detach().item()}")
|
94 |
else:
|
95 |
grad_output2 = None
|
96 |
return grad_output2, None, None
|
|
|
232 |
# consistentID specific modules are still in fp32. Will be converted to fp16
|
233 |
# later with .to(device, torch_dtype) by the caller.
|
234 |
pipe.load_ConsistentID_model(
|
235 |
+
consistentID_weight_path="models/ConsistentID/ConsistentID-v1.bin",
|
236 |
+
bise_net_weight_path="models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
|
237 |
)
|
238 |
# Avoid passing dtype to ConsistentIDPipeline.from_single_file(),
|
239 |
# because we've overloaded .to() to convert consistentID specific modules as well,
|
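`ScaleGrad` in util.py is an identity in the forward pass that rescales the gradient on the way back; the attention code uses it through `gen_gradient_scaler`. A minimal, hedged re-implementation of the pattern (not the repo's exact code, which also supports a debug flag):

```python
# Sketch: forward is identity, backward multiplies the incoming gradient by alpha.
import torch

class ScaleGradFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha: float):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward input: (x, alpha); alpha gets no gradient.
        return grad_output * ctx.alpha, None

def gen_gradient_scaler(alpha: float):
    return lambda x: ScaleGradFn.apply(x, alpha)

x = torch.randn(3, requires_grad=True)
gen_gradient_scaler(10.0)(x).sum().backward()
assert torch.allclose(x.grad, torch.full_like(x, 10.0))
```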
app.py
CHANGED
@@ -24,11 +24,16 @@ parser = argparse.ArgumentParser()
|
|
24 |
parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
|
25 |
choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
|
26 |
parser.add_argument('--adaface_ckpt_path', type=str,
|
27 |
-
default='models/adaface/VGGface2_HQ_masks2025-
|
28 |
parser.add_argument('--model_style_type', type=str, default='photorealistic',
|
29 |
choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
|
30 |
-
parser.add_argument("--guidance_scale", type=float, default=
|
31 |
-
help="The guidance scale for the diffusion model. Default:
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
parser.add_argument('--gpu', type=int, default=None)
|
34 |
parser.add_argument('--ip', type=str, default="0.0.0.0")
|
@@ -70,7 +75,8 @@ adaface_base_model_path = model_style_type2base_model_path["photorealistic"]
|
|
70 |
id_animator = load_model(model_style_type=args.model_style_type, device='cpu')
|
71 |
adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=adaface_base_model_path,
|
72 |
adaface_encoder_types=args.adaface_encoder_types,
|
73 |
-
adaface_ckpt_paths=args.adaface_ckpt_path, device='cpu'
|
|
|
74 |
|
75 |
basedir = os.getcwd()
|
76 |
savedir = os.path.join(basedir,'samples')
|
@@ -80,22 +86,22 @@ os.makedirs(savedir, exist_ok=True)
|
|
80 |
#os.system(f"rm -rf gradio_cached_examples/")
|
81 |
|
82 |
def swap_to_gallery(images):
|
83 |
-
# Update
|
84 |
# Or:
|
85 |
-
# Update
|
86 |
return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(value=images, visible=False)
|
87 |
|
88 |
def remove_back_to_files():
|
89 |
-
# Hide
|
90 |
# Or:
|
91 |
-
# Hide
|
92 |
-
return gr.update(visible=False), gr.update(visible=False), gr.update(value=None, visible=True), gr.update(value="0")
|
93 |
|
94 |
def get_clicked_image(data: gr.SelectData):
|
95 |
return data.index
|
96 |
|
97 |
@spaces.GPU
|
98 |
-
def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale, out_image_count=4):
|
99 |
if uploaded_image_paths is None:
|
100 |
print("No image uploaded")
|
101 |
return None, None, None
|
@@ -112,7 +118,7 @@ def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale
|
|
112 |
with torch.no_grad():
|
113 |
adaface_subj_embs = \
|
114 |
adaface.prepare_adaface_embeddings(image_paths=uploaded_image_paths, face_id_embs=None,
|
115 |
-
|
116 |
|
117 |
if adaface_subj_embs is None:
|
118 |
raise gr.Error(f"Failed to detect any faces! Please try with other images")
|
@@ -127,6 +133,7 @@ def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale
|
|
127 |
else:
|
128 |
prompt = "face portrait, " + prompt
|
129 |
|
|
|
130 |
guidance_scale = min(guidance_scale, 5)
|
131 |
|
132 |
# samples: A list of PIL Image instances.
|
@@ -134,7 +141,7 @@ def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale
|
|
134 |
samples = adaface(noise, prompt, placeholder_tokens_pos='append',
|
135 |
guidance_scale=guidance_scale,
|
136 |
out_image_count=out_image_count,
|
137 |
-
repeat_prompt_for_each_encoder=
|
138 |
verbose=True)
|
139 |
|
140 |
face_paths = []
|
@@ -145,7 +152,7 @@ def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale
|
|
145 |
sample.save(face_path)
|
146 |
print(f"Generated init image: {face_path}")
|
147 |
|
148 |
-
# Update
|
149 |
return gr.update(value=face_paths, visible=True), gr.update(value=face_paths, visible=False), gr.update(visible=True)
|
150 |
|
151 |
@spaces.GPU(duration=90)
|
@@ -153,7 +160,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
153 |
init_image_strength, init_image_final_weight,
|
154 |
prompt, negative_prompt, num_steps, video_length, guidance_scale,
|
155 |
seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
|
156 |
-
highlight_face, is_adaface_enabled, adaface_power_scale,
|
157 |
id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):
|
158 |
|
159 |
global adaface, id_animator
|
@@ -195,10 +202,19 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
195 |
adaface.prepare_adaface_embeddings(image_paths=uploaded_image_paths, face_id_embs=None,
|
196 |
update_text_encoder=True)
|
197 |
|
|
|
|
198 |
# adaface_prompt_embeds: [1, 77, 768].
|
199 |
adaface_prompt_embeds, negative_prompt_embeds, _, _ = \
|
200 |
adaface.encode_prompt(prompt, placeholder_tokens_pos='append',
|
201 |
-
|
|
|
202 |
verbose=True)
|
203 |
|
204 |
# ID-Animator Image Embedding Initial and End Scales
|
@@ -267,13 +283,14 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
267 |
<b>Official demo</b> for our working paper <b>AdaFace: A Versatile Text-space Face Encoder for Face Synthesis and Processing</b>.<br>
|
268 |
|
269 |
❗️**NOTE**❗️
|
270 |
-
- Support switching between three model styles: **
|
271 |
- If you change the model style, please allow 20~30 seconds for the new model weights to load before image/video generation begins.
|
272 |
|
273 |
❗️**Tips**❗️
|
274 |
- You can upload one or more subject images for generating ID-specific video.
|
275 |
-
-
|
276 |
-
-
|
|
|
277 |
- Usage explanations and demos: [Readme](https://huggingface.co/spaces/adaface-neurips/adaface-animate/blob/main/README2.md).
|
278 |
- AdaFace Text-to-Image: <a href="https://huggingface.co/spaces/adaface-neurips/adaface" style="display: inline-flex; align-items: center;">
|
279 |
AdaFace
|
@@ -285,16 +302,16 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
285 |
|
286 |
with gr.Row():
|
287 |
with gr.Column():
|
288 |
-
|
289 |
label="Drag / Select 1 or more photos of a person's face",
|
290 |
file_types=["image"],
|
291 |
file_count="multiple"
|
292 |
)
|
293 |
-
|
294 |
image_container = gr.Image(label="image container", sources="upload", type="numpy", height=256, visible=False)
|
295 |
-
|
296 |
with gr.Column(visible=False) as clear_button_column:
|
297 |
-
remove_and_reupload = gr.ClearButton(value="Remove and upload subject images", components=
|
298 |
|
299 |
init_img_files = gr.File(
|
300 |
label="[Optional] Generate 4 images and select 1 image",
|
@@ -305,7 +322,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
305 |
init_img_container = gr.Image(label="init image container", sources="upload", type="numpy", height=256, visible=False)
|
306 |
# Although there's only one image, we still use columns=3, to scale down the image size.
|
307 |
# Otherwise it will occupy the full width, and the gallery won't show the whole image.
|
308 |
-
|
309 |
# placeholder is just hint, not the real value. So we use "value='0'" instead of "placeholder='0'".
|
310 |
init_img_selected_idx = gr.Textbox(label="Selected init image index", value="0", visible=False)
|
311 |
|
@@ -320,7 +337,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
320 |
allow_custom_value=True,
|
321 |
choices=[
|
322 |
"portrait, highlighted hair, futuristic silver armor suit, confident stance, living room, smiling, head tilted, perfect smooth skin",
|
323 |
-
"portrait, walking on the beach, sunset",
|
324 |
"portrait, in a white apron and chef hat, garnishing a gourmet dish",
|
325 |
"portrait, dancing pose among folks in a park, waving hands",
|
326 |
"portrait, in iron man costume, the sky ablaze with hues of orange and purple",
|
@@ -328,18 +345,21 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
328 |
"portrait, night view of tokyo street, neon light",
|
329 |
"portrait, playing guitar on a boat, ocean waves",
|
330 |
"portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
|
331 |
-
"portrait, celebrating new year, fireworks",
|
332 |
-
"portrait, running pose in a park",
|
333 |
"portrait, in space suit, space helmet, walking on mars",
|
334 |
"portrait, in superman costume, the sky ablaze with hues of orange and purple"
|
335 |
])
|
336 |
|
337 |
-
highlight_face = gr.Checkbox(label="Highlight face", value=
|
338 |
info="Enhance the facial features by prepending 'face portrait' to the prompt",
|
339 |
visible=True)
|
340 |
-
|
|
|
|
|
|
|
341 |
init_image_strength = gr.Slider(
|
342 |
-
label="Init Image
|
343 |
info="How much the init image should influence each frame. 0: no influence (scenes are more dynamic), 3: strongest influence (scenes are more static).",
|
344 |
minimum=0,
|
345 |
maximum=3,
|
@@ -352,7 +372,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
352 |
minimum=0,
|
353 |
maximum=2,
|
354 |
step=0.025,
|
355 |
-
value=0.
|
356 |
)
|
357 |
|
358 |
model_style_type = gr.Dropdown(
|
@@ -415,7 +435,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
415 |
minimum=0.8,
|
416 |
maximum=1.2,
|
417 |
step=0.05,
|
418 |
-
value=1.
|
419 |
visible=True,
|
420 |
)
|
421 |
|
@@ -443,7 +463,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
443 |
minimum=0,
|
444 |
maximum=1,
|
445 |
step=0.1,
|
446 |
-
value=0.
|
447 |
)
|
448 |
|
449 |
id_animator_anneal_steps = gr.Slider(
|
@@ -464,13 +484,13 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
464 |
with gr.Column():
|
465 |
result_video = gr.Video(label="Generated Animation", interactive=False)
|
466 |
|
467 |
-
|
468 |
-
remove_and_reupload.click(fn=remove_back_to_files, outputs=[
|
469 |
|
470 |
init_img_files.upload(fn=swap_to_gallery, inputs=init_img_files,
|
471 |
-
outputs=[
|
472 |
remove_init_and_reupload.click(fn=remove_back_to_files,
|
473 |
-
outputs=[
|
474 |
init_img_files, init_img_selected_idx])
|
475 |
gen_init.click(fn=check_prompt_and_model_type,
|
476 |
inputs=[prompt, model_style_type],outputs=None).success(
|
@@ -479,10 +499,11 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
479 |
outputs=seed,
|
480 |
queue=False,
|
481 |
api_name=False,
|
482 |
-
).then(fn=gen_init_images, inputs=[
|
|
|
483 |
guidance_scale],
|
484 |
-
outputs=[
|
485 |
-
|
486 |
|
487 |
submit.click(fn=check_prompt_and_model_type,
|
488 |
inputs=[prompt, model_style_type],outputs=None).success(
|
@@ -493,11 +514,11 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
493 |
api_name=False,
|
494 |
).then(
|
495 |
fn=generate_video,
|
496 |
-
inputs=[image_container,
|
497 |
init_img_files, init_img_selected_idx, init_image_strength, init_image_final_weight,
|
498 |
prompt, negative_prompt, num_steps, video_length, guidance_scale,
|
499 |
seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
|
500 |
-
highlight_face, is_adaface_enabled,
|
501 |
adaface_power_scale, id_animator_anneal_steps],
|
502 |
outputs=[result_video]
|
503 |
)
|
|
|
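The event-wiring hunks above all follow the same Gradio swap pattern: an upload on the file input fills a gallery, reveals a clear-button column, and hides the file input, while the clear button reverses the swap. Below is a minimal, self-contained sketch of that pattern. It reuses the component and handler names from the diff but simplifies the handlers to three outputs, so treat it as an illustration rather than the app's exact code.

import gradio as gr

def swap_to_gallery(files):
    # Fill and show the gallery, reveal the clear-button column, hide the File input.
    return gr.update(value=files, visible=True), gr.update(visible=True), gr.update(visible=False)

def remove_back_to_files():
    # Empty and hide the gallery, hide the clear-button column, show the File input again.
    return gr.update(value=None, visible=False), gr.update(visible=False), gr.update(value=None, visible=True)

with gr.Blocks() as demo:
    ref_files = gr.File(label="Drag / Select 1 or more photos of a person's face",
                        file_types=["image"], file_count="multiple")
    uploaded_ref_files_gallery = gr.Gallery(label="Subject images", visible=False, columns=3)
    with gr.Column(visible=False) as clear_button_column:
        remove_and_reupload = gr.ClearButton(value="Remove and upload subject images",
                                             components=ref_files, size="sm")

    ref_files.upload(fn=swap_to_gallery, inputs=ref_files,
                     outputs=[uploaded_ref_files_gallery, clear_button_column, ref_files])
    remove_and_reupload.click(fn=remove_back_to_files,
                              outputs=[uploaded_ref_files_gallery, clear_button_column, ref_files])

if __name__ == "__main__":
    demo.launch()

In the app itself the same two handlers are shared with the init-image gallery, which is why the real remove_back_to_files returns a fourth update that resets init_img_selected_idx.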
parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                    choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
parser.add_argument('--adaface_ckpt_path', type=str,
+                   default='models/adaface/VGGface2_HQ_masks2025-05-22T17-51-19_zero3-ada-1000.pt')
parser.add_argument('--model_style_type', type=str, default='photorealistic',
                    choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
+parser.add_argument("--guidance_scale", type=float, default=6.0,
+                    help="The guidance scale for the diffusion model. Default: 6.0")
+parser.add_argument('--num_inference_steps', type=int, default=50,
+                    help="The number of denoising steps for image generation (NOT FOR VIDEOS). Default: 50")
+parser.add_argument('--ablate_prompt_embed_type', type=str, default='ada',
+                    choices=["ada", "arc2face", "consistentID"],
+                    help="Ablate to use the image ID embs instead of Ada embs")

parser.add_argument('--gpu', type=int, default=None)
parser.add_argument('--ip', type=str, default="0.0.0.0")

id_animator = load_model(model_style_type=args.model_style_type, device='cpu')
adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=adaface_base_model_path,
                         adaface_encoder_types=args.adaface_encoder_types,
+                        adaface_ckpt_paths=args.adaface_ckpt_path, device='cpu',
+                        num_inference_steps=args.num_inference_steps)

basedir = os.getcwd()
savedir = os.path.join(basedir,'samples')

#os.system(f"rm -rf gradio_cached_examples/")

def swap_to_gallery(images):
+   # Update uploaded_ref_files_gallery, show ref_files, hide clear_button_column
    # Or:
+   # Update generated_init_img_gallery, show init_img_files, hide init_clear_button_column
    return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(value=images, visible=False)

def remove_back_to_files():
+   # Hide uploaded_ref_files_gallery, show clear_button_column, hide ref_files, reset init_img_selected_idx
    # Or:
+   # Hide generated_init_img_gallery, hide init_clear_button_column, show init_img_files, reset init_img_selected_idx
+   return gr.update(value=None, visible=False), gr.update(visible=False), gr.update(value=None, visible=True), gr.update(value="0")

def get_clicked_image(data: gr.SelectData):
    return data.index

@spaces.GPU
+def gen_init_images(uploaded_image_paths, prompt, highlight_face, enhance_composition, guidance_scale, out_image_count=4):
    if uploaded_image_paths is None:
        print("No image uploaded")
        return None, None, None
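For reference, the new --ablate_prompt_embed_type option interacts with --adaface_encoder_types through the small index-to-"img{i}" mapping shown in generate_video above. The following self-contained sketch reproduces that mapping with the argument declarations copied from the diff; the hard-coded parse_args list is only there to make the example runnable.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                    choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
parser.add_argument('--ablate_prompt_embed_type', type=str, default='ada',
                    choices=["ada", "arc2face", "consistentID"],
                    help="Ablate to use the image ID embs instead of Ada embs")

# Simulate launching the demo with "--ablate_prompt_embed_type arc2face".
args = parser.parse_args(["--ablate_prompt_embed_type", "arc2face"])

if args.ablate_prompt_embed_type != "ada":
    # The ablated encoder is addressed by its position in adaface_encoder_types:
    # "arc2face" sits at index 1, so the prompt embedding type becomes "img1".
    ablate_prompt_embed_index = args.adaface_encoder_types.index(args.ablate_prompt_embed_type)
    ablate_prompt_embed_type = f"img{ablate_prompt_embed_index}"
else:
    ablate_prompt_embed_type = "ada"

print(ablate_prompt_embed_type)  # -> img1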