import torch
import os
import numpy as np
import math
import decord
from tqdm import tqdm
import pathlib
from PIL import Image

from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.memory import DynamicSwapInstaller
from diffusers_helper.utils import resize_and_center_crop
from diffusers_helper.bucket_tools import find_nearest_bucket
from diffusers_helper.hunyuan import vae_encode, vae_decode
from .base_generator import BaseModelGenerator

class VideoBaseModelGenerator(BaseModelGenerator):
    """
    Model generator for the Video extension of the Original HunyuanVideo model.
    This generator accepts video input instead of a single image.
    """
    
    def __init__(self, **kwargs):
        """
        Initialize the Video model generator.
        """
        super().__init__(**kwargs)
        self.model_name = None # Subclass Model Specific
        self.model_path = None # Subclass Model Specific
        self.model_repo_id_for_cache = None # Subclass Model Specific
        self.full_video_latents = None # For context, set by worker() when available
        self.resolution = 640  # Default resolution
        self.no_resize = False  # Default to resize
        self.vae_batch_size = 16  # Default VAE batch size
        
        # Import decord and tqdm here to avoid import errors if not installed
        try:
            import decord
            from tqdm import tqdm
            self.decord = decord
            self.tqdm = tqdm
        except ImportError:
            print("Warning: decord or tqdm not installed. Video processing will not work.")
            self.decord = None
            self.tqdm = None
    
    def get_model_name(self):
        """
        Get the name of the model.
        """
        return self.model_name
    
    def load_model(self):
        """
        Load the Video transformer model.
        If offline mode is True, attempts to load from a local snapshot.
        """
        print(f"Loading {self.model_name} Transformer...")
        
        path_to_load = self.model_path # Initialize with the default path

        if self.offline:
            path_to_load = self._get_offline_load_path() # Calls the method in BaseModelGenerator
        
        # Create the transformer model
        self.transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
            path_to_load, 
            torch_dtype=torch.bfloat16
        ).cpu()
        
        # Configure the model
        self.transformer.eval()
        self.transformer.to(dtype=torch.bfloat16)
        self.transformer.requires_grad_(False)
        
        # Set up dynamic swap if not in high VRAM mode
        if not self.high_vram:
            DynamicSwapInstaller.install_model(self.transformer, device=self.gpu)
        else:
            # In high VRAM mode, move the entire model to GPU
            self.transformer.to(device=self.gpu)
        
        print(f"{self.model_name} Transformer Loaded from {path_to_load}.")
        return self.transformer
    
    def min_real_frames_to_encode(self, real_frames_available_count):
        """
        Minimum number of real frames to encode
        is the maximum number of real frames used for generation context.
        
        The number of latents could be calculated as below for video F1, but keeping it simple for now
        by hardcoding the Video F1 value at max_latents_used_for_context = 27.

        # Calculate the number of latent frames to encode from the end of the input video
        num_frames_to_encode_from_end = 1  # Default minimum
        if model_type == "Video":
            # Max needed is 1 (clean_latent_pre) + 2 (max 2x) + 16 (max 4x) = 19
            num_frames_to_encode_from_end = 19
        elif model_type == "Video F1":
            ui_num_cleaned_frames = job_params.get('num_cleaned_frames', 5)
            # Max effective_clean_frames based on VideoF1ModelGenerator's logic.
            # Max num_clean_frames from UI is 10 (modules/interface.py).
            # Max effective_clean_frames = 10 - 1 = 9.
            # total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
            # Max needed = 16 (max 4x) + 2 (max 2x) + 9 (max effective_clean_frames) = 27
            num_frames_to_encode_from_end = 27
        
        Note: 27 latents ~ 108 real frames = 3.6 seconds at 30 FPS.
        Note: 19 latents ~ 76 real frames ~ 2.5 seconds at 30 FPS.
        """

        max_latents_used_for_context = 27
        if self.get_model_name() == "Video":
            max_latents_used_for_context = 27  # Weird results on 19
        elif self.get_model_name() == "Video F1":
            max_latents_used_for_context = 27  # Enough for even Video F1 with cleaned_frames input of 10
        else:
            print("======================================================")
            print(f"    *****    Warning: Unsupported video extension model type: {self.get_model_name()}.")
            print(f"    *****    Using default max latents {max_latents_used_for_context} for context.")
            print( "    *****    Please report to the developers if you see this message:")
            print( "    *****    Discord: https://discord.gg/8Z2c3a4 or GitHub: https://github.com/colinurbs/FramePack-Studio")
            print("======================================================")
            # Probably better to press on with Video F1 max vs exception?
            # raise ValueError(f"Unsupported video extension model type: {self.get_model_name()}")

        latent_size_factor = 4 # real frames to latent frames conversion factor
        max_real_frames_used_for_context = max_latents_used_for_context * latent_size_factor

        # Shortest of available frames and max frames used for context
        trimmed_real_frames_count = min(real_frames_available_count, max_real_frames_used_for_context)
        if trimmed_real_frames_count < real_frames_available_count:
            print(f"Truncating video frames from {real_frames_available_count} to {trimmed_real_frames_count}, enough to populate context")

        # Truncate to nearest latent size (multiple of 4)
        frames_to_encode_count = (trimmed_real_frames_count // latent_size_factor) * latent_size_factor
        if frames_to_encode_count != trimmed_real_frames_count:
            print(f"Truncating video frames from {trimmed_real_frames_count} to {frames_to_encode_count}, for latent size compatibility")

        return frames_to_encode_count
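    # Illustrative worked example (comment only, not executed; frame counts are hypothetical):
    #   150 real frames available, max_latents_used_for_context = 27:
    #     max_real_frames_used_for_context = 27 * 4 = 108
    #     trimmed_real_frames_count        = min(150, 108) = 108
    #     frames_to_encode_count           = (108 // 4) * 4 = 108
    #   95 real frames available:
    #     trimmed_real_frames_count        = min(95, 108) = 95
    #     frames_to_encode_count           = (95 // 4) * 4 = 92  (aligned to the 4-frame latent factor)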

    def extract_video_frames(self, is_for_encode, video_path, resolution, no_resize=False, input_files_dir=None):
        """
        Extract real frames from a video, resized and center cropped as numpy array (T, H, W, C).
        
        Args:
            is_for_encode: If True, results are capped at maximum frames used for context, and aligned to 4-frame latent requirement.
            video_path: Path to the input video file.
            resolution: Target resolution for resizing frames.
            no_resize: Whether to use the original video resolution.
            input_files_dir: Directory for input files that won't be cleaned up.
        
        Returns:
            A tuple containing:
            - input_frames_resized_np: All input frames resized and center cropped as numpy array (T, H, W, C)
            - fps: Frames per second of the input video
            - target_height: Target height of the video
            - target_width: Target width of the video
        """
        def time_millis():
            import time
            return time.perf_counter() * 1000.0 # Convert seconds to milliseconds
        
        encode_start_time_millis = time_millis()
           
        # Normalize video path for Windows compatibility
        video_path = str(pathlib.Path(video_path).resolve())
        print(f"Processing video: {video_path}")
        
        # Check if the video is in the temp directory and if we have an input_files_dir
        if input_files_dir and "temp" in video_path:
            # Check if there's a copy of this video in the input_files_dir
            filename = os.path.basename(video_path)
            input_file_path = os.path.join(input_files_dir, filename)
            
            # If the file exists in input_files_dir, use that instead
            if os.path.exists(input_file_path):
                print(f"Using video from input_files_dir: {input_file_path}")
                video_path = input_file_path
            else:
                # If not, copy it to input_files_dir to prevent it from being deleted
                try:
                    from diffusers_helper.utils import generate_timestamp
                    safe_filename = f"{generate_timestamp()}_{filename}"
                    input_file_path = os.path.join(input_files_dir, safe_filename)
                    import shutil
                    shutil.copy2(video_path, input_file_path)
                    print(f"Copied video to input_files_dir: {input_file_path}")
                    video_path = input_file_path
                except Exception as e:
                    print(f"Error copying video to input_files_dir: {e}")

        try:
            # Load video and get FPS
            print("Initializing VideoReader...")
            vr = decord.VideoReader(video_path)
            fps = vr.get_avg_fps()  # Get input video FPS
            num_real_frames = len(vr)
            print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")

            # Read frames
            print("Reading video frames...")

            total_frames_in_video_file = len(vr)
            if is_for_encode:
                num_real_frames = self.min_real_frames_to_encode(total_frames_in_video_file)
                print(f"Using minimum real frames to encode: {num_real_frames}")
            # else left as all frames -- len(vr) with no regard for trimming or latent alignment

            # RT_BORG: Retaining this commented code for reference.
            # pftq encoder discarded truncated frames from the end of the video.
            # frames = vr.get_batch(range(num_real_frames)).asnumpy()  # Shape: (num_real_frames, height, width, channels)

            # RT_BORG: Retaining this commented code for reference.
            # pftq retained the entire encoded video.
            # Truncate to nearest latent size (multiple of 4)
            # latent_size_factor = 4
            # num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
            # if num_frames != num_real_frames:
            #     print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
            # num_real_frames = num_frames

            # Discard truncated frames from the beginning of the video, retaining the last num_real_frames
            # This ensures a smooth transition from the input video to the generated video
            start_frame_index = total_frames_in_video_file - num_real_frames
            frame_indices_to_extract = range(start_frame_index, total_frames_in_video_file)
            frames = vr.get_batch(frame_indices_to_extract).asnumpy()  # Shape: (num_real_frames, height, width, channels)
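            # Worked example (hypothetical counts): a 200-frame file with num_real_frames = 108
            # gives start_frame_index = 200 - 108 = 92, so frames 92..199 (the last 108 frames)
            # are extracted, preserving the tail of the input video for a smooth transition.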

            print(f"Frames read: {frames.shape}")

            # Get native video resolution
            native_height, native_width = frames.shape[1], frames.shape[2]
            print(f"Native video resolution: {native_width}x{native_height}")
        
            # Use native resolution if height/width not specified, otherwise use provided values
            target_height = native_height
            target_width = native_width
        
            # Adjust to nearest bucket for model compatibility
            if not no_resize:
                target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
                print(f"Adjusted resolution: {target_width}x{target_height}")
            else:
                print(f"Using native resolution without resizing: {target_width}x{target_height}")

            # Preprocess input frames to match desired resolution
            input_frames_resized_np = []
            for i, frame in tqdm(enumerate(frames), desc="Processing Video Frames", total=num_real_frames, mininterval=0.1):
                frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
                input_frames_resized_np.append(frame_np)
            input_frames_resized_np = np.stack(input_frames_resized_np)  # Shape: (num_real_frames, height, width, channels)
            print(f"Frames preprocessed: {input_frames_resized_np.shape}")

            resized_frames_time_millis = time_millis()
            if (False): # We really need a logger
                print("======================================================")
                memory_bytes = input_frames_resized_np.nbytes
                memory_kb = memory_bytes / 1024
                memory_mb = memory_kb / 1024
                print(f"    *****    input_frames_resized_np: {input_frames_resized_np.shape}")
                print(f"    *****    Memory usage: {int(memory_mb)} MB")
                duration_ms = resized_frames_time_millis - encode_start_time_millis
                print(f"    *****    Time taken to process frames tensor: {duration_ms / 1000.0:.2f} seconds")
                print("======================================================")

            return input_frames_resized_np, fps, target_height, target_width
        except Exception as e:
            print(f"Error in extract_video_frames: {str(e)}")
            raise

    # RT_BORG: video_encode produces and returns end_of_input_video_latent and end_of_input_video_image_np
    # which are not needed for Video models without end frame processing.
    # But these should be inexpensive and it's easier to keep the code uniform.
    @torch.no_grad()
    def video_encode(self, video_path, resolution, no_resize=False, vae_batch_size=16, device=None, input_files_dir=None):
        """
        Encode a video into latent representations using the VAE.
        
        Args:
            video_path: Path to the input video file.
            resolution: Target resolution for resizing frames.
            no_resize: Whether to use the original video resolution.
            vae_batch_size: Number of frames to process per batch.
            device: Device for computation (e.g., "cuda").
            input_files_dir: Directory for input files that won't be cleaned up.
        
        Returns:
            A tuple containing:
            - start_latent: Latent of the first frame
            - input_image_np: First frame as numpy array
            - history_latents: Latents of all frames
            - fps: Frames per second of the input video
            - target_height: Target height of the video
            - target_width: Target width of the video
            - input_video_pixels: Video frames as tensor
            - end_of_input_video_image_np: Last frame as numpy array
            - input_frames_resized_np: All input frames resized and center cropped as numpy array (T, H, W, C)
        """
        encoding = True  # Flag to indicate this is for encoding
        input_frames_resized_np, fps, target_height, target_width = self.extract_video_frames(encoding, video_path, resolution, no_resize, input_files_dir)

        try:
            if device is None:
                device = self.gpu
                
            # Check CUDA availability and fallback to CPU if needed
            if device == "cuda" and not torch.cuda.is_available():
                print("CUDA is not available, falling back to CPU")
                device = "cpu"

            # Save first frame for CLIP vision encoding
            input_image_np = input_frames_resized_np[0]
            end_of_input_video_image_np = input_frames_resized_np[-1]

            # Convert to tensor and normalize to [-1, 1]
            print("Converting frames to tensor...")
            frames_pt = torch.from_numpy(input_frames_resized_np).float() / 127.5 - 1
            frames_pt = frames_pt.permute(0, 3, 1, 2)  # Shape: (num_real_frames, channels, height, width)
            frames_pt = frames_pt.unsqueeze(0)  # Shape: (1, num_real_frames, channels, height, width)
            frames_pt = frames_pt.permute(0, 2, 1, 3, 4)  # Shape: (1, channels, num_real_frames, height, width)
            print(f"Tensor shape: {frames_pt.shape}")
            
            # Save pixel frames for use in worker
            input_video_pixels = frames_pt.cpu()

            # Move to device
            print(f"Moving tensor to device: {device}")
            frames_pt = frames_pt.to(device)
            print("Tensor moved to device")

            # Move VAE to device
            print(f"Moving VAE to device: {device}")
            self.vae.to(device)
            print("VAE moved to device")

            # Encode frames in batches
            print(f"Encoding input video frames in VAE batch size {vae_batch_size}")
            latents = []
            self.vae.eval()
            with torch.no_grad():
                frame_count = frames_pt.shape[2]
                step_count = math.ceil(frame_count / vae_batch_size)
                for i in tqdm(range(0, frame_count, vae_batch_size), desc="Encoding video frames", total=step_count, mininterval=0.1):
                    batch = frames_pt[:, :, i:i + vae_batch_size]  # Shape: (1, channels, batch_size, height, width)
                    try:
                        # Snapshot allocated GPU memory before encoding (value currently unused; kept for debugging)
                        if device == "cuda":
                            free_mem = torch.cuda.memory_allocated() / 1024**3
                        batch_latent = vae_encode(batch, self.vae)
                        # Synchronize CUDA to catch issues
                        if device == "cuda":
                            torch.cuda.synchronize()
                        latents.append(batch_latent)
                    except RuntimeError as e:
                        print(f"Error during VAE encoding: {str(e)}")
                        if device == "cuda" and "out of memory" in str(e).lower():
                            print("CUDA out of memory, try reducing vae_batch_size or using CPU")
                        raise
            
            # Concatenate latents
            print("Concatenating latents...")
            history_latents = torch.cat(latents, dim=2)  # Shape: (1, channels, frames, height//8, width//8)
            print(f"History latents shape: {history_latents.shape}")

            # Get first frame's latent
            start_latent = history_latents[:, :, :1]  # Shape: (1, channels, 1, height//8, width//8)
            print(f"Start latent shape: {start_latent.shape}")

            if (False): # We really need a logger
                print("======================================================")
                memory_bytes = history_latents.nbytes
                memory_kb = memory_bytes / 1024
                memory_mb = memory_kb / 1024
                print(f"    *****    history_latents: {history_latents.shape}")
                print(f"    *****    Memory usage: {int(memory_mb)} MB")
                print("======================================================")

            # Move VAE back to CPU to free GPU memory
            if device == "cuda":
                self.vae.to(self.cpu)
                torch.cuda.empty_cache()
                print("VAE moved back to CPU, CUDA cache cleared")

            return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels, end_of_input_video_image_np, input_frames_resized_np

        except Exception as e:
            print(f"Error in video_encode: {str(e)}")
            raise
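    # Hypothetical usage sketch (illustrative only; in this project video_encode is invoked by
    # worker() on a concrete subclass such as the Video or Video F1 generator, and the input
    # path below is a placeholder):
    #
    #   start_latent, first_frame_np, video_latents, fps, height, width, pixels, last_frame_np, frames_np = \
    #       generator.video_encode("input.mp4", resolution=640, vae_batch_size=16)
    #   generator.set_full_video_latents(video_latents)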
    
    # RT_BORG: Currently history_latents is initialized within worker() for all Video models as history_latents = video_latents
    # So it is a coding error to call prepare_history_latents() here.
    # Leaving in place as we will likely use it post-refactoring.
    def prepare_history_latents(self, height, width):
        """
        Prepare the history latents tensor for the Video model.
        
        Args:
            height: The height of the image
            width: The width of the image
            
        Returns:
            The initialized history latents tensor
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_history_latents' should not be called "
            "on the Video models. history_latents should be initialized within worker() for all Video models "
            "as history_latents = video_latents."
        )

    def prepare_indices(self, latent_padding_size, latent_window_size):
        """
        Prepare the indices for the Video model.
        
        Args:
            latent_padding_size: The size of the latent padding
            latent_window_size: The size of the latent window
            
        Returns:
            A tuple of (clean_latent_indices, latent_indices, clean_latent_2x_indices, clean_latent_4x_indices)
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_indices' should not be called "
            "on the Video models. Currently video models each have a combined method: <model>_prepare_clean_latents_and_indices."
        )

    def set_full_video_latents(self, video_latents):
        """
        Set the full video latents for context.
        
        Args:
            video_latents: The full video latents
        """
        self.full_video_latents = video_latents
    
    def prepare_clean_latents(self, start_latent, history_latents):
        """
        Prepare the clean latents for the Video model.
        
        Args:
            start_latent: The start latent
            history_latents: The history latents
            
        Returns:
            A tuple of (clean_latents, clean_latents_2x, clean_latents_4x)
        """
        raise TypeError(
            f"Error: '{self.__class__.__name__}.prepare_clean_latents' should not be called "
            "on the Video models. Currently video models each have a combined method: <model>_prepare_clean_latents_and_indices."
        )
    
    def get_section_latent_frames(self, latent_window_size, is_last_section):
        """
        Get the number of section latent frames for the Video model.
        
        Args:
            latent_window_size: The size of the latent window
            is_last_section: Whether this is the last section
            
        Returns:
            The number of section latent frames
        """
        return latent_window_size * 2
        
    def combine_videos(self, source_video_path, generated_video_path, output_path):
        """
        Combine the source video with the generated video side by side.
        
        Args:
            source_video_path: Path to the source video
            generated_video_path: Path to the generated video
            output_path: Path to save the combined video
            
        Returns:
            Path to the combined video
        """
        try:
            import os
            import subprocess
            
            print(f"Combining source video {source_video_path} with generated video {generated_video_path}")
            
            # Get the ffmpeg executable from the VideoProcessor class
            from modules.toolbox.toolbox_processor import VideoProcessor
            from modules.toolbox.message_manager import MessageManager
            
            # Create a message manager for logging
            message_manager = MessageManager()
            
            # Import settings from main module
            try:
                from __main__ import settings
                video_processor = VideoProcessor(message_manager, settings.settings)
            except ImportError:
                # Fallback to creating a new settings object
                from modules.settings import Settings
                settings = Settings()
                video_processor = VideoProcessor(message_manager, settings.settings)
            
            # Get the ffmpeg executable
            ffmpeg_exe = video_processor.ffmpeg_exe
            
            if not ffmpeg_exe:
                print("FFmpeg executable not found. Cannot combine videos.")
                return None
            
            print(f"Using ffmpeg at: {ffmpeg_exe}")
            
            # Create a temporary directory for the filter script
            import tempfile
            temp_dir = tempfile.gettempdir()
            filter_script_path = os.path.join(temp_dir, f"filter_script_{os.path.basename(output_path)}.txt")
            
            # Get video dimensions by parsing ffmpeg's stderr output
            def get_video_info(video_path):
                cmd = [
                    ffmpeg_exe, "-i", video_path, 
                    "-hide_banner", "-loglevel", "error"
                ]
                
                # Run ffmpeg to get video info (it will fail but output info to stderr)
                result = subprocess.run(cmd, capture_output=True, text=True)
                
                # Parse the output to get dimensions
                width = height = None
                for line in result.stderr.split('\n'):
                    if 'Video:' in line:
                        # Look for dimensions like 640x480
                        import re
                        match = re.search(r'(\d+)x(\d+)', line)
                        if match:
                            width = int(match.group(1))
                            height = int(match.group(2))
                            break
                
                return width, height
            
            # Get dimensions of both videos
            source_width, source_height = get_video_info(source_video_path)
            generated_width, generated_height = get_video_info(generated_video_path)
            
            if not source_width or not generated_width:
                print("Error: Could not determine video dimensions")
                return None
            
            print(f"Source video: {source_width}x{source_height}")
            print(f"Generated video: {generated_width}x{generated_height}")
            
            # Calculate target dimensions (maintain aspect ratio)
            target_height = max(source_height, generated_height)
            source_target_width = int(source_width * (target_height / source_height))
            generated_target_width = int(generated_width * (target_height / generated_height))
            
            # Create a complex filter for side-by-side display with labels
            filter_complex = (
                f"[0:v]scale={source_target_width}:{target_height}[left];"
                f"[1:v]scale={generated_target_width}:{target_height}[right];"
                f"[left]drawtext=text='Source':x=({source_target_width}/2-50):y=20:fontsize=24:fontcolor=white:box=1:boxcolor=black@0.5[left_text];"
                f"[right]drawtext=text='Generated':x=({generated_target_width}/2-70):y=20:fontsize=24:fontcolor=white:box=1:boxcolor=black@0.5[right_text];"
                f"[left_text][right_text]hstack=inputs=2[v]"
            )
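            # For illustration with hypothetical dimensions: a 960x540 source beside a 640x480
            # generated clip gives target_height = 540, source_target_width = 960, and
            # generated_target_width = int(640 * 540 / 480) = 720, so the rendered filter is:
            #   [0:v]scale=960:540[left];[1:v]scale=720:540[right];
            #   [left]drawtext=text='Source':x=(960/2-50):y=20:...[left_text];
            #   [right]drawtext=text='Generated':x=(720/2-70):y=20:...[right_text];
            #   [left_text][right_text]hstack=inputs=2[v]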
            
            # Write the filter script to a file
            with open(filter_script_path, 'w') as f:
                f.write(filter_complex)
            
            # Build the ffmpeg command
            cmd = [
                ffmpeg_exe, "-y",
                "-i", source_video_path,
                "-i", generated_video_path,
                "-filter_complex_script", filter_script_path,
                "-map", "[v]"
            ]
            
            # Check if source video has audio
            has_audio_cmd = [
                ffmpeg_exe, "-i", source_video_path,
                "-hide_banner", "-loglevel", "error"
            ]
            audio_check = subprocess.run(has_audio_cmd, capture_output=True, text=True)
            has_audio = "Audio:" in audio_check.stderr
            
            if has_audio:
                cmd.extend(["-map", "0:a"])
            
            # Add output options
            cmd.extend([
                "-c:v", "libx264",
                "-crf", "18",
                "-preset", "medium"
            ])
            
            if has_audio:
                cmd.extend(["-c:a", "aac"])
            
            cmd.append(output_path)
            
            # Run the ffmpeg command
            print(f"Running ffmpeg command: {' '.join(cmd)}")
            subprocess.run(cmd, check=True, capture_output=True, text=True)
            
            # Clean up the filter script
            if os.path.exists(filter_script_path):
                os.remove(filter_script_path)
            
            print(f"Combined video saved to {output_path}")
            return output_path
            
        except Exception as e:
            print(f"Error combining videos: {str(e)}")
            import traceback
            traceback.print_exc()
            return None
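    # Hypothetical usage sketch (paths are placeholders; combine_videos returns the output path
    # on success, or None if ffmpeg is unavailable or the command fails):
    #
    #   combined_path = generator.combine_videos(
    #       source_video_path="outputs/input.mp4",
    #       generated_video_path="outputs/generated.mp4",
    #       output_path="outputs/side_by_side.mp4",
    #   )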