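"""Stand-In video face-swap inference.

Given a reference face image and an input video, this script builds a face mask over
the video and regenerates it with a Wan video diffusion pipeline so that the face
matches the reference identity while the rest of the scene is preserved.

Example (the script filename is assumed; substitute this file's actual name):

    python infer_face_swap.py --ip_image test/input/ruonan.jpg --input_video test/input/woman.mp4 --output test/output/ruonan.mp4
"""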
import torch
from data.video import save_video
from wan_loader import load_wan_pipe
from models.set_condition_branch import set_stand_in
from preprocessor import FaceProcessor, VideoMaskGenerator
import argparse

parser = argparse.ArgumentParser()

parser.add_argument(
    "--ip_image",
    type=str,
    default="test/input/ruonan.jpg",
    help="Input face image path or URL",
)
parser.add_argument(
    "--input_video",
    type=str,
    default="test/input/woman.mp4",
    help="Input video path",
)
parser.add_argument(
    "--denoising_strength",
    type=float,
    default=0.85,
    help="Denoising strength; lower values keep the result closer to the original video.",
)
parser.add_argument(
    "--prompt",
    type=str,
    default=(
        "The video features a woman standing in front of a large screen displaying the words "
        '"Tech Minute" and the logo for CNET. She is wearing a purple top and appears to be '
        "presenting or speaking about technology-related topics. The background includes a "
        "cityscape with tall buildings, suggesting an urban setting. The woman seems to be "
        "engaged in a discussion or providing information on technology news or trends. The "
        "overall atmosphere is professional and informative, likely aimed at educating viewers "
        "about the latest developments in the tech industry."
    ),
    help="Text prompt for video generation",
)
parser.add_argument(
    "--output",
    type=str,
    default="test/output/ruonan.mp4",
    help="Output video file path",
)
parser.add_argument(
    "--seed", type=int, default=0, help="Random seed for reproducibility"
)
parser.add_argument(
    "--num_inference_steps", type=int, default=20, help="Number of inference steps"
)
# Exposed as a flag: argparse's type=bool would turn any non-empty string,
# including "False", into True.
parser.add_argument(
    "--force_background_consistency",
    action="store_true",
    help="Force background consistency across generated frames.",
)

# Default negative prompt, kept in Chinese as provided; roughly: garish tones, overexposed,
# static, blurry details, subtitles, style, artwork, painting, stills, overall gray, worst
# quality, low quality, JPEG artifacts, ugly, mutilated, extra fingers, poorly drawn hands or
# face, deformed, disfigured, malformed limbs, fused fingers, motionless frame, cluttered
# background, three legs, crowded background, walking backwards.
parser.add_argument(
    "--negative_prompt",
    type=str,
    default="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
    help="Negative prompt to avoid unwanted features",
)
parser.add_argument("--tiled", action="store_true", help="Enable tiled mode")
parser.add_argument(
    "--fps", type=int, default=25, help="Frames per second for output video"
)
parser.add_argument(
    "--quality", type=int, default=9, help="Output video quality (1-9)"
)
parser.add_argument(
    "--base_path",
    type=str,
    default="checkpoints/base_model/",
    help="Path to base model checkpoint",
)
parser.add_argument(
    "--stand_in_path",
    type=str,
    default="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt",
    help="Path to LoRA weights checkpoint",
)
parser.add_argument(
    "--antelopv2_path",
    type=str,
    default="checkpoints/antelopev2",
    help="Path to AntelopeV2 model checkpoint",
)

args = parser.parse_args()

# Both preprocessors use the AntelopeV2 face-analysis models.
face_processor = FaceProcessor(antelopv2_path=args.antelopv2_path)
videomask_generator = VideoMaskGenerator(antelopv2_path=args.antelopv2_path)

# Process the reference face (extra_input=True also returns an RGBA crop), then build
# the per-frame face mask and working resolution/frame count from the input video.
ip_image, ip_image_rgba = face_processor.process(args.ip_image, extra_input=True)
input_video, face_mask, width, height, num_frames = videomask_generator.process(
    args.input_video,
    ip_image_rgba,
    random_horizontal_flip_chance=0.05,
    dilation_kernel_size=10,
)

# Load the Wan video pipeline in face-swap mode with bfloat16 weights.
pipe = load_wan_pipe(
    base_path=args.base_path, face_swap=True, torch_dtype=torch.bfloat16
)

# Attach the Stand-In conditioning branch (LoRA weights) to the pipeline.
set_stand_in(
    pipe,
    model_path=args.stand_in_path,
)

# Masked video-to-video generation: regenerate the input video with the reference
# identity applied to the face region, guided by the text prompt.
video = pipe(
    prompt=args.prompt,
    negative_prompt=args.negative_prompt,
    seed=args.seed,
    width=width,
    height=height,
    num_frames=num_frames,
    denoising_strength=args.denoising_strength,
    ip_image=ip_image,
    face_mask=face_mask,
    input_video=input_video,
    num_inference_steps=args.num_inference_steps,
    tiled=args.tiled,
    force_background_consistency=args.force_background_consistency,
)
save_video(video, args.output, fps=args.fps, quality=args.quality)