File size: 11,972 Bytes
ac40637
26557da
c088540
 
 
 
26557da
 
 
 
 
 
 
 
 
 
 
927c90e
 
a5dfe07
927c90e
 
 
 
 
 
 
 
 
26557da
 
 
 
 
 
 
 
 
 
 
 
 
8666f4d
 
e7db0ca
8666f4d
 
 
 
 
 
 
bb96703
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8666f4d
26557da
 
 
 
 
 
 
 
c4d0858
26557da
 
 
 
8666f4d
 
 
 
 
 
bb96703
26557da
 
 
 
4cd0de2
8666f4d
a5dfe07
 
8666f4d
c088540
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a50960
26557da
 
 
 
 
c4d0858
c088540
1e388da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4d0858
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b155875
c4d0858
26557da
 
 
 
 
 
c088540
b155875
26557da
 
 
a5dfe07
b155875
26557da
 
8666f4d
c088540
26557da
 
 
 
1e388da
de48061
26557da
 
 
c088540
26557da
1e388da
 
 
 
1640678
1e388da
269c36d
1e388da
 
 
 
 
1640678
1e388da
269c36d
1e388da
 
 
 
 
1640678
1e388da
269c36d
1e388da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a1c704
 
1e388da
 
26557da
 
 
 
 
 
 
 
 
 
 
 
1e388da
26557da
 
 
8a1c704
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import spaces
import gradio as gr
import os

# True only when this process runs inside the "fffiloni/Stand-In" Space.
# SPACE_ID may be unset (e.g. when the script is run locally), so fall back to
# an empty id instead of raising KeyError at import time.
is_shared_ui = "fffiloni/Stand-In" in os.environ.get("SPACE_ID", "")

import torch
import time
from PIL import Image
import tempfile
import os

from data.video import save_video
from wan_loader import load_wan_pipe
from models.set_condition_branch import set_stand_in
from preprocessor import FaceProcessor

from huggingface_hub import snapshot_download

print("Downloading models, please wait...")

# (repo id, local target directory) pairs, fetched in this exact order.
for _repo_id, _target_dir in (
    ("Wan-AI/Wan2.1-T2V-14B", "checkpoints/base_model/"),
    ("DIAMONIK7777/antelopev2", "checkpoints/antelopev2/models/antelopev2"),
    ("BowenXue/Stand-In", "checkpoints/Stand-In/"),
):
    snapshot_download(_repo_id, local_dir=_target_dir)

# Local checkpoint locations; they sit under the download targets used earlier
# in this file. Plain string assignments cannot fail, so they live outside the
# try block and stay defined for the rest of the module.
ANTELOPEV2_PATH = "checkpoints/antelopev2"
BASE_MODEL_PATH = "checkpoints/base_model/"
LORA_MODEL_PATH = "checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt"

try:
    # Fail early with a precise message if any checkpoint is missing.
    if not os.path.exists(ANTELOPEV2_PATH):
        raise FileNotFoundError(
            f"AntelopeV2 checkpoint not found at: {ANTELOPEV2_PATH}"
        )
    if not os.path.exists(BASE_MODEL_PATH):
        raise FileNotFoundError(f"Base model not found at: {BASE_MODEL_PATH}")
    if not os.path.exists(LORA_MODEL_PATH):
        raise FileNotFoundError(f"LoRA model not found at: {LORA_MODEL_PATH}")

    print(f"Is CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available() and not is_shared_ui:
        # A GPU is visible at import time and this is not the shared Space:
        # build the models once, up front.
        face_processor = FaceProcessor(antelopv2_path=ANTELOPEV2_PATH)
        pipe = load_wan_pipe(base_path=BASE_MODEL_PATH, torch_dtype=torch.bfloat16)
        set_stand_in(pipe, model_path=LORA_MODEL_PATH)
        print("Model loaded successfully!")
    else:
        print("Will load models on ZeroGPU on inference if available")

except Exception as e:
    # Top-level boundary: report the failure, show an error page, then stop.
    import traceback

    print(f"Model loading failed: {e}")
    # The error page points users at the console, so emit the full traceback
    # there rather than only the exception message.
    traceback.print_exc()
    with gr.Blocks() as demo:
        gr.Markdown("# Error: Model Loading Failed")
        gr.Markdown(f"""
        Please check the following:
        1.  Make sure the checkpoint files are placed in the correct directory.
        2.  Ensure all dependencies are properly installed.
        3.  Check the console output for detailed error information.
        
        **Error details**: {e}
        """)
    demo.launch()
    # Equivalent to the interactive-shell helper exit(), without relying on the
    # site module having injected it.
    raise SystemExit

def _load_models():
    """Return the ``(face_processor, pipe)`` pair to use for one generation.

    Objects created at import time are reused when both exist at module
    level; otherwise a fresh pair is built from the checkpoint paths.
    """
    module_scope = globals()
    if "face_processor" in module_scope and "pipe" in module_scope:
        return module_scope["face_processor"], module_scope["pipe"]

    print("Loading models...")
    processor = FaceProcessor(antelopv2_path=ANTELOPEV2_PATH)
    wan_pipe = load_wan_pipe(base_path=BASE_MODEL_PATH, torch_dtype=torch.bfloat16)
    set_stand_in(wan_pipe, model_path=LORA_MODEL_PATH)
    print("Model loaded successfully!")
    return processor, wan_pipe


@spaces.GPU(duration=300)
def generate_video(
    pil_image: Image.Image,
    prompt: str,
    seed: int,
    negative_prompt: str,
    num_steps: int,
    fps: int,
    quality: int,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a video from a face image and a prompt and return its path.

    Args:
        pil_image: Face image supplied by the UI; ``None`` when nothing was
            uploaded.
        prompt: Text description passed to the pipeline.
        seed: Seed passed to the pipeline (cast to ``int``).
        negative_prompt: Negative prompt passed to the pipeline.
        num_steps: Number of inference steps; forced to 10 on the shared UI.
        fps: Frame rate used when saving the video (cast to ``int``).
        quality: Quality value forwarded to ``save_video``; forced to 6 on the
            shared UI.
        progress: Gradio progress tracker; not referenced directly in the body.

    Returns:
        Path of the ``.mp4`` file the video was saved to.

    Raises:
        gr.Error: If ``pil_image`` is ``None``.
    """
    if pil_image is None:
        raise gr.Error("Please upload a face image first!")

    # Always bind both names locally. Assigning them only in a conditional
    # branch would leave them unbound on the other path, because any assignment
    # makes a name local to the whole function.
    face_processor, pipe = _load_models()

    print("Processing face...")
    ip_image = face_processor.process(pil_image)
    print("Face processing completed.")

    if is_shared_ui:
        # The shared UI ignores the advanced sliders and uses fixed settings.
        num_steps = 10
        quality = 6

    print("Generating video...")
    start_time = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        seed=int(seed),
        ip_image=ip_image,
        num_inference_steps=int(num_steps),
        tiled=False,
    )
    end_time = time.time()
    print(f"Video generated in {end_time - start_time:.2f} seconds.")

    # Reserve a unique path, then close the handle before writing so that
    # save_video is the only writer of the file.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        video_path = temp_file.name
    save_video(video, video_path, fps=int(fps), quality=quality)
    print(f"Video saved to: {video_path}")
    return video_path


# Negative prompt used both as the textbox default and in every example row.
default_negative_prompt = "Vibrant colors, overexposure, static, blurred details, subtitles, style, artwork, painting, still image, overall grayness, worst quality, low quality, JPEG compression residue, ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, malformed limbs, fused fingers, still image, cluttered background, three legs, crowded background, walking backwards"

# Advanced controls are locked on the shared Space and editable elsewhere.
advanced_editable = not is_shared_ui

# Each example is (face image path, prompt, seed); all rows share the same
# negative prompt, inference steps (20), FPS (25) and quality (9).
example_rows = [
    [face_path, scene_prompt, example_seed, default_negative_prompt, 20, 25, 9]
    for face_path, scene_prompt, example_seed in (
        (
            "./examples/standin_example1.jpg",
            "In a corridor where the walls ripple like water, a woman reaches out to touch the flowing surface, causing circles of ripples to spread. The camera moves from a medium shot to a close-up, capturing her curious expression as she sees her distorted reflection.",
            42,
        ),
        (
            "./examples/standin_example4.jpg",
            "A man in a white lab coat stands in front of a laboratory bench. Having just completed a critical step, he turns toward the camera, his eyes shining with the excitement of discovery. The background is filled with precise instruments and an array of beakers and test tubes, capturing the rigor and allure of scientific exploration.",
            420,
        ),
        (
            "./examples/standin_example8.png",
            "The video features a man standing at an easel, focused intently as his brush dances across the canvas. His expression is one of deep concentration, with a hint of satisfaction as each brushstroke adds color and form. He wears a paint-splattered apron, and his hands move with confident precision. The setting, filled with scattered art supplies, open paint tubes, and unfinished sketches pinned to the wall, suggests an artist's studio. A large window on one side allows sunlight to stream in, casting a soft glow across the room and illuminating the colors on his canvas. The atmosphere is creative and inspired, with the man's intense focus and the lively colors on the canvas indicating a moment of artistic passion and expression.",
            4200,
        ),
    )
]

with gr.Blocks() as demo:
    # Page header: title, tagline, shared-UI notice and project links.
    gr.Markdown(
        """
        # Stand-In IP2V
        """
    )
    gr.Markdown("A Lightweight and Plug-and-Play Identity Control for Video Generation")
    gr.Markdown("On fffiloni's shared UI, advanced settings are disabled to optimize best results on ZeroGPU.")
    gr.HTML("""
    <div style="display:flex;column-gap:4px;">
        <a href="https://github.com/WeChatCV/Stand-In">
            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
        </a> 
        <a href="https://stand-in-video.github.io/">
            <img src='https://img.shields.io/badge/Project-Page-green'>
        </a>
        <a href="https://arxiv.org/abs/2508.07901">
            <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
        </a>
        <a href="https://huggingface.co/spaces/fffiloni/Stand-In?duplicate=true">
            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
        </a>
    </div>
    """)

    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload a Face Image")
            input_image = gr.Image(
                height=300,
                image_mode="RGB",
                type="pil",
                label="Upload Image",
            )

            gr.Markdown("### 2. Enter Core Parameters")
            input_prompt = gr.Textbox(
                label="Prompt",
                placeholder="Please enter a detailed description of the scene, character actions, expressions, etc...",
                value="A man sits comfortably at his desk, facing the camera, as if conversing with a friend or family member in front of a screen. His eyes are focused yet gentle, and a natural smile plays on his lips. The background is his meticulously decorated personal space, with photos and a world map on the wall, conveying a sense of intimacy and modern communication.",
                lines=4,
            )

            input_seed = gr.Slider(
                minimum=0,
                maximum=100000,
                step=1,
                value=0,
                label="Seed",
                info="The same seed and parameters will generate the same result.",
            )

            with gr.Accordion("Advanced Options", open=False):
                input_negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value=default_negative_prompt,
                    lines=3,
                    interactive=advanced_editable,
                )
                input_steps = gr.Slider(
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=10,
                    label="Inference Steps",
                    info="More steps may improve details but will take longer to generate.",
                    interactive=advanced_editable,
                )
                output_fps = gr.Slider(
                    minimum=10,
                    maximum=30,
                    step=1,
                    value=25,
                    label="Video FPS",
                    interactive=advanced_editable,
                )
                output_quality = gr.Slider(
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=6,
                    label="Video Quality",
                    interactive=advanced_editable,
                )

            generate_btn = gr.Button("Generate Video", variant="primary")

        # Components fed to generate_video, in its positional-argument order.
        generation_inputs = [
            input_image,
            input_prompt,
            input_seed,
            input_negative_prompt,
            input_steps,
            output_fps,
            output_quality,
        ]

        # Right column: the result player and the clickable examples.
        with gr.Column(scale=2):
            gr.Markdown("### 3. View Generated Result")
            output_video = gr.Video(label="Generated Video")

            examples = gr.Examples(
                examples=example_rows,
                fn=generate_video,
                inputs=generation_inputs,
                outputs=output_video,
                cache_examples=False,
            )

    generate_btn.click(
        fn=generate_video,
        inputs=generation_inputs,
        outputs=output_video,
    )

if __name__ == "__main__":
    # Enable the request queue first, then start the server with SSR and the
    # API page turned off and errors surfaced in the UI.
    launch_options = {"ssr_mode": False, "show_error": True, "show_api": False}
    demo.queue().launch(**launch_options)