import torch
import os
import numpy as np
import math
import decord
from tqdm import tqdm
import pathlib
from PIL import Image

from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.memory import DynamicSwapInstaller
from diffusers_helper.utils import resize_and_center_crop
from diffusers_helper.bucket_tools import find_nearest_bucket
from diffusers_helper.hunyuan import vae_encode, vae_decode
from .base_generator import BaseModelGenerator


class VideoBaseModelGenerator(BaseModelGenerator):
    """
    Model generator for the Video extension of the Original HunyuanVideo model.
    This generator accepts video input instead of a single image.
    """

    def __init__(self, **kwargs):
        """
        Initialize the Video model generator.
        """
        super().__init__(**kwargs)
        self.model_name = None  # Subclass Model Specific
        self.model_path = None  # Subclass Model Specific
        self.model_repo_id_for_cache = None  # Subclass Model Specific
        self.full_video_latents = None  # For context, set by worker() when available

        self.resolution = 640  # Default resolution
        self.no_resize = False  # Default to resize
        self.vae_batch_size = 16  # Default VAE batch size

        # Import decord and tqdm here to avoid import errors if not installed
        try:
            import decord
            from tqdm import tqdm
            self.decord = decord
            self.tqdm = tqdm
        except ImportError:
            print("Warning: decord or tqdm not installed. Video processing will not work.")
            self.decord = None
            self.tqdm = None

    def get_model_name(self):
        """
        Get the name of the model.
        """
        return self.model_name

    def load_model(self):
        """
        Load the Video transformer model.
        If offline mode is True, attempts to load from a local snapshot.
        """
        print(f"Loading {self.model_name} Transformer...")

        path_to_load = self.model_path  # Initialize with the default path
        if self.offline:
            path_to_load = self._get_offline_load_path()  # Calls the method in BaseModelGenerator

        # Create the transformer model
        self.transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
            path_to_load,
            torch_dtype=torch.bfloat16
        ).cpu()

        # Configure the model
        self.transformer.eval()
        self.transformer.to(dtype=torch.bfloat16)
        self.transformer.requires_grad_(False)

        # Set up dynamic swap if not in high VRAM mode
        if not self.high_vram:
            DynamicSwapInstaller.install_model(self.transformer, device=self.gpu)
        else:
            # In high VRAM mode, move the entire model to GPU
            self.transformer.to(device=self.gpu)

        print(f"{self.model_name} Transformer Loaded from {path_to_load}.")
        return self.transformer
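    # Illustrative sketch only: concrete subclasses are expected to fill in the
    # "Subclass Model Specific" fields that __init__ leaves as None. The class name
    # and checkpoint path below are hypothetical placeholders, not the shipped subclasses.
    #
    #   class MyVideoGenerator(VideoBaseModelGenerator):
    #       def __init__(self, **kwargs):
    #           super().__init__(**kwargs)
    #           self.model_name = "Video"
    #           self.model_path = "some-org/some-framepack-video-checkpoint"
    #           self.model_repo_id_for_cache = "models--some-org--some-framepack-video-checkpoint"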
    def min_real_frames_to_encode(self, real_frames_available_count):
        """
        Minimum number of real frames to encode is the maximum number of real frames
        used for generation context.

        The number of latents could be calculated per model type as below, but keeping it
        simple for now by hardcoding the Video F1 value of max_latents_used_for_context = 27.

            # Calculate the number of latent frames to encode from the end of the input video
            num_frames_to_encode_from_end = 1  # Default minimum
            if model_type == "Video":
                # Max needed is 1 (clean_latent_pre) + 2 (max 2x) + 16 (max 4x) = 19
                num_frames_to_encode_from_end = 19
            elif model_type == "Video F1":
                ui_num_cleaned_frames = job_params.get('num_cleaned_frames', 5)
                # Max effective_clean_frames based on VideoF1ModelGenerator's logic.
                # Max num_clean_frames from UI is 10 (modules/interface.py).
                # Max effective_clean_frames = 10 - 1 = 9.
                # total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
                # Max needed = 16 (max 4x) + 2 (max 2x) + 9 (max effective_clean_frames) = 27
                num_frames_to_encode_from_end = 27

        Note: 27 latents ~ 108 real frames = 3.6 seconds at 30 FPS.
        Note: 19 latents ~ 76 real frames ~ 2.5 seconds at 30 FPS.
        """
        max_latents_used_for_context = 27
        if self.get_model_name() == "Video":
            max_latents_used_for_context = 27  # Weird results on 19
        elif self.get_model_name() == "Video F1":
            max_latents_used_for_context = 27  # Enough for even Video F1 with cleaned_frames input of 10
        else:
            print("======================================================")
            print(f" ***** Warning: Unsupported video extension model type: {self.get_model_name()}.")
            print(f" ***** Using default max latents {max_latents_used_for_context} for context.")
            print(" ***** Please report to the developers if you see this message:")
            print(" ***** Discord: https://discord.gg/8Z2c3a4 or GitHub: https://github.com/colinurbs/FramePack-Studio")
            print("======================================================")
            # Probably better to press on with the Video F1 max than to raise an exception?
            # raise ValueError(f"Unsupported video extension model type: {self.get_model_name()}")

        latent_size_factor = 4  # real frames to latent frames conversion factor
        max_real_frames_used_for_context = max_latents_used_for_context * latent_size_factor

        # Shortest of available frames and max frames used for context
        trimmed_real_frames_count = min(real_frames_available_count, max_real_frames_used_for_context)
        if trimmed_real_frames_count < real_frames_available_count:
            print(f"Truncating video frames from {real_frames_available_count} to {trimmed_real_frames_count}, enough to populate context")

        # Truncate to nearest latent size (multiple of 4)
        frames_to_encode_count = (trimmed_real_frames_count // latent_size_factor) * latent_size_factor
        if frames_to_encode_count != trimmed_real_frames_count:
            print(f"Truncating video frames from {trimmed_real_frames_count} to {frames_to_encode_count}, for latent size compatibility")

        return frames_to_encode_count
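    # Worked example (illustrative): with max_latents_used_for_context = 27, at most
    # 27 * 4 = 108 real frames are ever encoded.
    #   min_real_frames_to_encode(250) -> min(250, 108) = 108 -> 108 (already a multiple of 4)
    #   min_real_frames_to_encode(90)  -> min(90, 108)  = 90  -> 88  (rounded down to a multiple of 4)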
    def extract_video_frames(self, is_for_encode, video_path, resolution, no_resize=False, input_files_dir=None):
        """
        Extract real frames from a video, resized and center cropped, as a numpy array (T, H, W, C).

        Args:
            is_for_encode: If True, results are capped at the maximum number of frames used for context,
                and aligned to the 4-frame latent requirement.
            video_path: Path to the input video file.
            resolution: Target resolution for resizing frames.
            no_resize: Whether to use the original video resolution.
            input_files_dir: Directory for input files that won't be cleaned up.

        Returns:
            A tuple containing:
            - input_frames_resized_np: All input frames resized and center cropped as numpy array (T, H, W, C)
            - fps: Frames per second of the input video
            - target_height: Target height of the video
            - target_width: Target width of the video
        """
        def time_millis():
            import time
            return time.perf_counter() * 1000.0  # Convert seconds to milliseconds

        encode_start_time_millis = time_millis()

        # Normalize video path for Windows compatibility
        video_path = str(pathlib.Path(video_path).resolve())
        print(f"Processing video: {video_path}")

        # Check if the video is in the temp directory and if we have an input_files_dir
        if input_files_dir and "temp" in video_path:
            # Check if there's a copy of this video in the input_files_dir
            filename = os.path.basename(video_path)
            input_file_path = os.path.join(input_files_dir, filename)

            # If the file exists in input_files_dir, use that instead
            if os.path.exists(input_file_path):
                print(f"Using video from input_files_dir: {input_file_path}")
                video_path = input_file_path
            else:
                # If not, copy it to input_files_dir to prevent it from being deleted
                try:
                    from diffusers_helper.utils import generate_timestamp
                    safe_filename = f"{generate_timestamp()}_{filename}"
                    input_file_path = os.path.join(input_files_dir, safe_filename)
                    import shutil
                    shutil.copy2(video_path, input_file_path)
                    print(f"Copied video to input_files_dir: {input_file_path}")
                    video_path = input_file_path
                except Exception as e:
                    print(f"Error copying video to input_files_dir: {e}")

        try:
            # Load video and get FPS
            print("Initializing VideoReader...")
            vr = decord.VideoReader(video_path)
            fps = vr.get_avg_fps()  # Get input video FPS
            num_real_frames = len(vr)
            print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

            # Read frames
            print("Reading video frames...")

            total_frames_in_video_file = len(vr)
            if is_for_encode:
                print(f"Using minimum real frames to encode: {self.min_real_frames_to_encode(total_frames_in_video_file)}")
                num_real_frames = self.min_real_frames_to_encode(total_frames_in_video_file)
            # else left as all frames -- len(vr) with no regard for trimming or latent alignment

            # RT_BORG: Retaining this commented code for reference.
            # pftq encoder discarded truncated frames from the end of the video.
            # frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)

            # RT_BORG: Retaining this commented code for reference.
            # pftq retained the entire encoded video.
            # Truncate to nearest latent size (multiple of 4)
            # latent_size_factor = 4
            # num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
            # if num_frames != num_real_frames:
            #     print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
            # num_real_frames = num_frames

            # Discard truncated frames from the beginning of the video, retaining the last num_real_frames.
            # This ensures a smooth transition from the input video to the generated video.
            start_frame_index = total_frames_in_video_file - num_real_frames
            frame_indices_to_extract = range(start_frame_index, total_frames_in_video_file)
            frames = vr.get_batch(frame_indices_to_extract).asnumpy()  # Shape: (num_real_frames, height, width, channels)

            print(f"Frames read: {frames.shape}")

            # Get native video resolution
            native_height, native_width = frames.shape[1], frames.shape[2]
            print(f"Native video resolution: {native_width}x{native_height}")

            # Use native resolution if height/width not specified, otherwise use provided values
            target_height = native_height
            target_width = native_width

            # Adjust to nearest bucket for model compatibility
            if not no_resize:
                target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
                print(f"Adjusted resolution: {target_width}x{target_height}")
            else:
                print(f"Using native resolution without resizing: {target_width}x{target_height}")

            # Preprocess input frames to match desired resolution
            input_frames_resized_np = []
            for i, frame in tqdm(enumerate(frames), desc="Processing Video Frames", total=num_real_frames, mininterval=0.1):
                frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
                input_frames_resized_np.append(frame_np)
            input_frames_resized_np = np.stack(input_frames_resized_np)  # Shape: (num_real_frames, height, width, channels)
            print(f"Frames preprocessed: {input_frames_resized_np.shape}")

            resized_frames_time_millis = time_millis()
            if (False):  # We really need a logger
                print("======================================================")
                memory_bytes = input_frames_resized_np.nbytes
                memory_kb = memory_bytes / 1024
                memory_mb = memory_kb / 1024
                print(f" ***** input_frames_resized_np: {input_frames_resized_np.shape}")
                print(f" ***** Memory usage: {int(memory_mb)} MB")
                duration_ms = resized_frames_time_millis - encode_start_time_millis
                print(f" ***** Time taken to process frames tensor: {duration_ms / 1000.0:.2f} seconds")
                print("======================================================")

            return input_frames_resized_np, fps, target_height, target_width

        except Exception as e:
            print(f"Error in extract_video_frames: {str(e)}")
            raise
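    # Illustrative call (sketch only; the path and variable names are hypothetical):
    #
    #   frames_np, fps, height, width = generator.extract_video_frames(
    #       is_for_encode=True, video_path="input.mp4", resolution=640)
    #   # frames_np has shape (T, height, width, 3); because is_for_encode is True it is
    #   # capped at the context window and trimmed to a multiple of 4 frames.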
    # RT_BORG: video_encode produces and returns end_of_input_video_latent and end_of_input_video_image_np,
    # which are not needed for Video models without end frame processing.
    # But these should be inexpensive and it's easier to keep the code uniform.
    @torch.no_grad()
    def video_encode(self, video_path, resolution, no_resize=False, vae_batch_size=16, device=None, input_files_dir=None):
        """
        Encode a video into latent representations using the VAE.

        Args:
            video_path: Path to the input video file.
            resolution: Target resolution for resizing frames.
            no_resize: Whether to use the original video resolution.
            vae_batch_size: Number of frames to process per batch.
            device: Device for computation (e.g., "cuda").
            input_files_dir: Directory for input files that won't be cleaned up.

        Returns:
            A tuple containing:
            - start_latent: Latent of the first frame
            - input_image_np: First frame as numpy array
            - history_latents: Latents of all frames
            - fps: Frames per second of the input video
            - target_height: Target height of the video
            - target_width: Target width of the video
            - input_video_pixels: Video frames as tensor
            - end_of_input_video_image_np: Last frame as numpy array
            - input_frames_resized_np: All input frames resized and center cropped as numpy array (T, H, W, C)
        """
        encoding = True  # Flag to indicate this is for encoding
        input_frames_resized_np, fps, target_height, target_width = self.extract_video_frames(
            encoding, video_path, resolution, no_resize, input_files_dir)

        try:
            if device is None:
                device = self.gpu

            # Check CUDA availability and fall back to CPU if needed
            if device == "cuda" and not torch.cuda.is_available():
                print("CUDA is not available, falling back to CPU")
                device = "cpu"

            # Save first frame for CLIP vision encoding
            input_image_np = input_frames_resized_np[0]
            end_of_input_video_image_np = input_frames_resized_np[-1]

            # Convert to tensor and normalize to [-1, 1]
            print("Converting frames to tensor...")
            frames_pt = torch.from_numpy(input_frames_resized_np).float() / 127.5 - 1
            frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
            frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
            frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
            print(f"Tensor shape: {frames_pt.shape}")

            # Save pixel frames for use in worker
            input_video_pixels = frames_pt.cpu()

            # Move to device
            print(f"Moving tensor to device: {device}")
            frames_pt = frames_pt.to(device)
            print("Tensor moved to device")

            # Move VAE to device
            print(f"Moving VAE to device: {device}")
            self.vae.to(device)
            print("VAE moved to device")

            # Encode frames in batches
            print(f"Encoding input video frames in VAE batch size {vae_batch_size}")
            latents = []
            self.vae.eval()
            with torch.no_grad():
                frame_count = frames_pt.shape[2]
                step_count = math.ceil(frame_count / vae_batch_size)
                for i in tqdm(range(0, frame_count, vae_batch_size), desc="Encoding video frames", total=step_count, mininterval=0.1):
                    batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                    try:
                        # Log GPU memory before encoding
                        if device == "cuda":
                            free_mem = torch.cuda.memory_allocated() / 1024**3
                        batch_latent = vae_encode(batch, self.vae)
                        # Synchronize CUDA to catch issues
                        if device == "cuda":
                            torch.cuda.synchronize()
                        latents.append(batch_latent)
                    except RuntimeError as e:
                        print(f"Error during VAE encoding: {str(e)}")
                        if device == "cuda" and "out of memory" in str(e).lower():
                            print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                        raise

            # Concatenate latents
            print("Concatenating latents...")
            history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
            print(f"History latents shape: {history_latents.shape}")

            # Get first frame's latent
            start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
            print(f"Start latent shape: {start_latent.shape}")

            if (False):  # We really need a logger
                print("======================================================")
                memory_bytes = history_latents.nbytes
                memory_kb = memory_bytes / 1024
                memory_mb = memory_kb / 1024
                print(f" ***** history_latents: {history_latents.shape}")
                print(f" ***** Memory usage: {int(memory_mb)} MB")
                print("======================================================")

            # Move VAE back to CPU to free GPU memory
            if device == "cuda":
                self.vae.to(self.cpu)
                torch.cuda.empty_cache()
                print("VAE moved back to CPU, CUDA cache cleared")

            return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels, end_of_input_video_image_np, input_frames_resized_np

        except Exception as e:
            print(f"Error in video_encode: {str(e)}")
            raise

    # RT_BORG: Currently history_latents is initialized within worker() for all Video models as
    # history_latents = video_latents, so it is a coding error to call prepare_history_latents() here.
    # Leaving in place as we will likely use it post-refactoring.
    def prepare_history_latents(self, height, width):
        """
        Prepare the history latents tensor for the Video model.

        Args:
            height: The height of the image
            width: The width of the image

        Returns:
            The initialized history latents tensor
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_history_latents' should not be called "
            "on the Video models. history_latents should be initialized within worker() for all Video models "
            "as history_latents = video_latents."
        )

    def prepare_indices(self, latent_padding_size, latent_window_size):
        """
        Prepare the indices for the Video model.

        Args:
            latent_padding_size: The size of the latent padding
            latent_window_size: The size of the latent window

        Returns:
            A tuple of (clean_latent_indices, latent_indices, clean_latent_2x_indices, clean_latent_4x_indices)
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_indices' should not be called "
            "on the Video models. Currently video models each have a combined method: _prepare_clean_latents_and_indices."
        )

    def set_full_video_latents(self, video_latents):
        """
        Set the full video latents for context.

        Args:
            video_latents: The full video latents
        """
        self.full_video_latents = video_latents

    def prepare_clean_latents(self, start_latent, history_latents):
        """
        Prepare the clean latents for the Video model.

        Args:
            start_latent: The start latent
            history_latents: The history latents

        Returns:
            A tuple of (clean_latents, clean_latents_2x, clean_latents_4x)
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_clean_latents' should not be called "
            "on the Video models. Currently video models each have a combined method: _prepare_clean_latents_and_indices."
        )

    def get_section_latent_frames(self, latent_window_size, is_last_section):
        """
        Get the number of section latent frames for the Video model.

        Args:
            latent_window_size: The size of the latent window
            is_last_section: Whether this is the last section

        Returns:
            The number of section latent frames
        """
        return latent_window_size * 2
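    # Illustrative wiring (sketch only, following the worker() comments above; names are hypothetical):
    #
    #   (start_latent, input_image_np, video_latents, fps, height, width,
    #    input_video_pixels, end_frame_np, frames_np) = generator.video_encode(
    #       "input.mp4", resolution=640, vae_batch_size=16)
    #   generator.set_full_video_latents(video_latents)
    #   history_latents = video_latents  # worker() initializes history from the encoded input video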
    def combine_videos(self, source_video_path, generated_video_path, output_path):
        """
        Combine the source video with the generated video side by side.

        Args:
            source_video_path: Path to the source video
            generated_video_path: Path to the generated video
            output_path: Path to save the combined video

        Returns:
            Path to the combined video
        """
        try:
            import subprocess

            print(f"Combining source video {source_video_path} with generated video {generated_video_path}")

            # Get the ffmpeg executable from the VideoProcessor class
            from modules.toolbox.toolbox_processor import VideoProcessor
            from modules.toolbox.message_manager import MessageManager

            # Create a message manager for logging
            message_manager = MessageManager()

            # Import settings from main module
            try:
                from __main__ import settings
                video_processor = VideoProcessor(message_manager, settings.settings)
            except ImportError:
                # Fallback to creating a new settings object
                from modules.settings import Settings
                settings = Settings()
                video_processor = VideoProcessor(message_manager, settings.settings)

            # Get the ffmpeg executable
            ffmpeg_exe = video_processor.ffmpeg_exe

            if not ffmpeg_exe:
                print("FFmpeg executable not found. Cannot combine videos.")
                return None

            print(f"Using ffmpeg at: {ffmpeg_exe}")

            # Create a temporary directory for the filter script
            import tempfile
            temp_dir = tempfile.gettempdir()
            filter_script_path = os.path.join(temp_dir, f"filter_script_{os.path.basename(output_path)}.txt")

            # Get video dimensions using ffprobe
            def get_video_info(video_path):
                cmd = [
                    ffmpeg_exe,
                    "-i", video_path,
                    "-hide_banner",
                    "-loglevel", "error"
                ]

                # Run ffmpeg to get video info (it will fail but output info to stderr)
                result = subprocess.run(cmd, capture_output=True, text=True)

                # Parse the output to get dimensions
                width = height = None
                for line in result.stderr.split('\n'):
                    if 'Video:' in line:
                        # Look for dimensions like 640x480
                        import re
                        match = re.search(r'(\d+)x(\d+)', line)
                        if match:
                            width = int(match.group(1))
                            height = int(match.group(2))
                        break
                return width, height

            # Get dimensions of both videos
            source_width, source_height = get_video_info(source_video_path)
            generated_width, generated_height = get_video_info(generated_video_path)

            if not source_width or not generated_width:
                print("Error: Could not determine video dimensions")
                return None

            print(f"Source video: {source_width}x{source_height}")
            print(f"Generated video: {generated_width}x{generated_height}")

            # Calculate target dimensions (maintain aspect ratio)
            target_height = max(source_height, generated_height)
            source_target_width = int(source_width * (target_height / source_height))
            generated_target_width = int(generated_width * (target_height / generated_height))

            # Create a complex filter for side-by-side display with labels
            filter_complex = (
                f"[0:v]scale={source_target_width}:{target_height}[left];"
                f"[1:v]scale={generated_target_width}:{target_height}[right];"
                f"[left]drawtext=text='Source':x=({source_target_width}/2-50):y=20:fontsize=24:fontcolor=white:box=1:boxcolor=black@0.5[left_text];"
                f"[right]drawtext=text='Generated':x=({generated_target_width}/2-70):y=20:fontsize=24:fontcolor=white:box=1:boxcolor=black@0.5[right_text];"
                f"[left_text][right_text]hstack=inputs=2[v]"
            )

            # Write the filter script to a file
            with open(filter_script_path, 'w') as f:
                f.write(filter_complex)

            # Build the ffmpeg command
            cmd = [
                ffmpeg_exe,
                "-y",
                "-i", source_video_path,
                "-i", generated_video_path,
                "-filter_complex_script", filter_script_path,
                "-map", "[v]"
            ]

            # Check if source video has audio
            has_audio_cmd = [
                ffmpeg_exe,
                "-i", source_video_path,
                "-hide_banner",
                "-loglevel", "error"
            ]
            audio_check = subprocess.run(has_audio_cmd, capture_output=True, text=True)
            has_audio = "Audio:" in audio_check.stderr
"Audio:" in audio_check.stderr if has_audio: cmd.extend(["-map", "0:a"]) # Add output options cmd.extend([ "-c:v", "libx264", "-crf", "18", "-preset", "medium" ]) if has_audio: cmd.extend(["-c:a", "aac"]) cmd.append(output_path) # Run the ffmpeg command print(f"Running ffmpeg command: {' '.join(cmd)}") subprocess.run(cmd, check=True, capture_output=True, text=True) # Clean up the filter script if os.path.exists(filter_script_path): os.remove(filter_script_path) print(f"Combined video saved to {output_path}") return output_path except Exception as e: print(f"Error combining videos: {str(e)}") import traceback traceback.print_exc() return None