Spaces:

mgbam
/

CingenAI

Running

App Files Files Community

mgbam committed 18 days ago

Commit

990e23e

verified ·

1 Parent(s): 48777e5

Update core/visual_engine.py

Browse files

Files changed (1) hide show

core/visual_engine.py +82 -73

core/visual_engine.py CHANGED Viewed

@@ -1,9 +1,9 @@
 # core/visual_engine.py
-from PIL import Image, ImageDraw, ImageFont
 from moviepy.editor import (ImageClip, concatenate_videoclips, TextClip,
-                            CompositeVideoClip, vfx) # Added vfx for effects
-import moviepy.video.fx.all as vfx # More explicit import for resize
-import numpy as np # For converting PIL images to numpy arrays for moviepy
 import os
 import openai
 import requests
@@ -16,12 +16,12 @@ class VisualEngine:
         self.font_filename = "arial.ttf"
         self.font_path_in_container = f"/usr/local/share/fonts/truetype/mycustomfonts/{self.font_filename}"
-        self.font_size_pil = 24 # For placeholder images
-        self.video_overlay_font_size = 36 # For text overlays on video
         self.video_overlay_font_color = 'white'
-        # For video overlays, try to use a system font that moviepy/ImageMagick can find
-        # Or provide a path to a .ttf file for TextClip's font parameter
-        self.video_overlay_font = 'Arial' # Generic name, ImageMagick might find it. Or use self.font_path_in_container
         try:
             self.font = ImageFont.truetype(self.font_path_in_container, self.font_size_pil)
@@ -34,16 +34,14 @@ class VisualEngine:
         self.openai_api_key = None
         self.USE_AI_IMAGE_GENERATION = False
         self.dalle_model = "dall-e-3"
-        self.image_size = "1024x1024"
-        # For DALL-E 3, you might want a slightly larger video frame to accommodate 1024x1024 images
-        self.video_frame_size = (1024, 576) # 16:9, DALL-E images will be letterboxed or cropped if not 16:9.
-                                         # Or (1024,1024) if you want square video frames.
     def set_openai_api_key(self, api_key):
-        # ... (remains the same) ...
         if api_key:
             self.openai_api_key = api_key
-            # openai.api_key = self.openai_api_key # Older versions. New client takes key per call.
             self.USE_AI_IMAGE_GENERATION = True
             print("OpenAI API key set. AI Image Generation Enabled with DALL-E.")
         else:
@@ -51,16 +49,15 @@ class VisualEngine:
             print("OpenAI API key not provided. AI Image Generation Disabled. Using placeholders.")
     def _get_text_dimensions(self, text_content, font_obj):
-        # ... (remains the same) ...
         if text_content == "" or text_content is None:
             return 0, self.font_size_pil
         try:
-            if hasattr(font_obj, 'getbbox'):
                 bbox = font_obj.getbbox(text_content)
                 width = bbox[2] - bbox[0]
                 height = bbox[3] - bbox[1]
                 return width, height if height > 0 else self.font_size_pil
-            elif hasattr(font_obj, 'getsize'):
                 width, height = font_obj.getsize(text_content)
                 return width, height if height > 0 else self.font_size_pil
             else:
@@ -73,8 +70,7 @@ class VisualEngine:
             height_estimate = self.font_size_pil * 1.2
             return int(len(text_content) * avg_char_width), int(height_estimate if height_estimate > 0 else self.font_size_pil)
-    def _create_placeholder_image_content(self, text_description, filename, size=(1024, 576)):
-        # ... (remains the same) ...
         img = Image.new('RGB', size, color=(30, 30, 60))
         draw = ImageDraw.Draw(img)
         padding = 30
@@ -122,7 +118,6 @@ class VisualEngine:
         return filepath
     def generate_image_visual(self, image_prompt_text, scene_identifier_filename):
-        # ... (DALL-E logic remains the same, including fallback to _create_placeholder_image_content) ...
         filepath = os.path.join(self.output_dir, scene_identifier_filename)
         if self.USE_AI_IMAGE_GENERATION and self.openai_api_key:
             try:
@@ -130,19 +125,21 @@ class VisualEngine:
                 client = openai.OpenAI(api_key=self.openai_api_key)
                 response = client.images.generate(
                     model=self.dalle_model, prompt=image_prompt_text, n=1,
-                    size=self.image_size, quality="standard", response_format="url"
                 )
                 image_url = response.data[0].url
-                revised_prompt_dalle3 = response.data[0].revised_prompt
                 if revised_prompt_dalle3: print(f"DALL-E 3 revised prompt: {revised_prompt_dalle3[:150]}...")
-                image_response = requests.get(image_url, timeout=60) # Increased timeout for image download
                 image_response.raise_for_status()
-                img_data = Image.open(io.BytesIO(image_response.content))
-                # Ensure image is RGB before saving as PNG (some APIs might return RGBA)
-                if img_data.mode == 'RGBA':
                     img_data = img_data.convert('RGB')
                 img_data.save(filepath)
                 print(f"AI Image (DALL-E) saved: {filepath}")
                 return filepath
@@ -152,23 +149,21 @@ class VisualEngine:
                 print(f"Requests Error downloading DALL-E image: {e}")
             except Exception as e:
                 print(f"Generic error during DALL-E image generation: {e}")
             print("Falling back to placeholder image due to DALL-E error.")
             return self._create_placeholder_image_content(
                 f"[DALL-E Failed] Prompt: {image_prompt_text[:150]}...",
-                scene_identifier_filename, size=self.video_frame_size # Use video frame size for placeholder
             )
-        else:
             return self._create_placeholder_image_content(
                 image_prompt_text, scene_identifier_filename, size=self.video_frame_size
             )
     def create_video_from_images(self, image_data_list, output_filename="final_video.mp4", fps=24, duration_per_image=3):
-        """
-        Creates a video from a list of image file paths and associated text.
-        image_data_list: List of dictionaries, each like:
-                         {'path': 'path/to/image.png', 'scene_num': 1, 'key_action': 'Some action'}
-        """
         if not image_data_list:
             print("No image data provided to create video.")
             return None
@@ -185,44 +180,55 @@ class VisualEngine:
                 print(f"Image path invalid or not found: {img_path}. Skipping for video.")
                 continue
             try:
-                # Load image and resize to fit video_frame_size, maintaining aspect ratio (letterbox/pillarbox)
-                pil_image = Image.open(img_path)
-                pil_image.thumbnail(self.video_frame_size, Image.Resampling.LANCZOS) # Resize in place
-                # Create a background matching video_frame_size
-                background = Image.new('RGB', self.video_frame_size, (0,0,0)) # Black background
-                # Paste the thumbnail onto the center of the background
-                paste_x = (self.video_frame_size[0] - pil_image.width) // 2
-                paste_y = (self.video_frame_size[1] - pil_image.height) // 2
-                background.paste(pil_image, (paste_x, paste_y))
-                # Convert PIL image to numpy array for MoviePy
-                frame_np = np.array(background)
                 img_clip = ImageClip(frame_np).set_duration(duration_per_image)
-                # Simple Ken Burns effect (zoom in slightly)
-                # End scale (e.g., 1.1 = 10% zoom in). Adjust for desired effect.
-                end_scale = 1.05
-                img_clip = img_clip.fx(vfx.resize, lambda t: 1 + (end_scale-1) * (t / duration_per_image) )
-                # To keep it centered while zooming:
-                img_clip = img_clip.set_position('center')
-                # Add Text Overlay for Scene Number and Key Action
-                overlay_text = f"Scene {scene_num}\n{key_action}"
-                txt_clip = TextClip(overlay_text, fontsize=self.video_overlay_font_size,
-                                    color=self.video_overlay_font_color,
-                                    font=self.video_overlay_font, # Ensure this font is findable by ImageMagick
-                                    bg_color='rgba(0,0,0,0.5)', # Semi-transparent black background
-                                    size=(img_clip.w * 0.9, None), # Width 90% of image, height auto
-                                    method='caption', # Auto-wrap text
-                                    align='West', # Left align
-                                    kerning=-1
-                                    ).set_duration(duration_per_image - 0.5).set_start(0.25) # Show for most of duration
-                txt_clip = txt_clip.set_position(('center', 0.85), relative=True) # Position at 85% from top, centered
-                # Composite the image and text
                 video_with_text_overlay = CompositeVideoClip([img_clip, txt_clip], size=self.video_frame_size)
                 processed_clips.append(video_with_text_overlay)
@@ -233,9 +239,11 @@ class VisualEngine:
             print("No clips could be processed for the video.")
             return None
-        # Concatenate with crossfade transitions
-        final_video_clip = concatenate_videoclips(processed_clips, padding=-0.5, method="compose").fx(vfx.fadein, 0.5).fx(vfx.fadeout, 0.5)
-        # padding = -0.5 means 0.5s crossfade. Requires method="compose"
         output_path = os.path.join(self.output_dir, output_filename)
         print(f"Writing final video to: {output_path}")
@@ -250,6 +258,7 @@ class VisualEngine:
         except Exception as e:
             print(f"Error writing final video file: {e}")
             return None
-        finally: # Ensure clips are closed
-            for clip in processed_clips: clip.close()
             if hasattr(final_video_clip, 'close'): final_video_clip.close()

 # core/visual_engine.py
+from PIL import Image, ImageDraw, ImageFont # Pillow should be >= 10.0.0
 from moviepy.editor import (ImageClip, concatenate_videoclips, TextClip,
+                            CompositeVideoClip)
+import moviepy.video.fx.all as vfx # For effects like resize, fadein, fadeout
+import numpy as np
 import os
 import openai
 import requests
         self.font_filename = "arial.ttf"
         self.font_path_in_container = f"/usr/local/share/fonts/truetype/mycustomfonts/{self.font_filename}"
+        self.font_size_pil = 24
+        self.video_overlay_font_size = 36
         self.video_overlay_font_color = 'white'
+        # For video overlays, TextClip will use ImageMagick. 'Arial' is a common system font name.
+        # If issues, use self.font_path_in_container (if ImageMagick can access it via moviepy)
+        self.video_overlay_font = 'Arial'
         try:
             self.font = ImageFont.truetype(self.font_path_in_container, self.font_size_pil)
         self.openai_api_key = None
         self.USE_AI_IMAGE_GENERATION = False
         self.dalle_model = "dall-e-3"
+        self.image_size = "1024x1024" # DALL-E 3 output size
+        # Target video frame size (e.g., 16:9 aspect ratio)
+        # DALL-E 3 images (1024x1024) will be letter/pillar-boxed to fit this.
+        self.video_frame_size = (1280, 720)
     def set_openai_api_key(self, api_key):
         if api_key:
             self.openai_api_key = api_key
             self.USE_AI_IMAGE_GENERATION = True
             print("OpenAI API key set. AI Image Generation Enabled with DALL-E.")
         else:
             print("OpenAI API key not provided. AI Image Generation Disabled. Using placeholders.")
     def _get_text_dimensions(self, text_content, font_obj):
         if text_content == "" or text_content is None:
             return 0, self.font_size_pil
         try:
+            if hasattr(font_obj, 'getbbox'): # Pillow >= 8.0.0
                 bbox = font_obj.getbbox(text_content)
                 width = bbox[2] - bbox[0]
                 height = bbox[3] - bbox[1]
                 return width, height if height > 0 else self.font_size_pil
+            elif hasattr(font_obj, 'getsize'): # Older Pillow
                 width, height = font_obj.getsize(text_content)
                 return width, height if height > 0 else self.font_size_pil
             else:
             height_estimate = self.font_size_pil * 1.2
             return int(len(text_content) * avg_char_width), int(height_estimate if height_estimate > 0 else self.font_size_pil)
+    def _create_placeholder_image_content(self, text_description, filename, size=(1024, 576)): # Default placeholder size
         img = Image.new('RGB', size, color=(30, 30, 60))
         draw = ImageDraw.Draw(img)
         padding = 30
         return filepath
     def generate_image_visual(self, image_prompt_text, scene_identifier_filename):
         filepath = os.path.join(self.output_dir, scene_identifier_filename)
         if self.USE_AI_IMAGE_GENERATION and self.openai_api_key:
             try:
                 client = openai.OpenAI(api_key=self.openai_api_key)
                 response = client.images.generate(
                     model=self.dalle_model, prompt=image_prompt_text, n=1,
+                    size=self.image_size, quality="standard", response_format="url"
+                    # style="vivid" # or "natural" for DALL-E 3, optional
                 )
                 image_url = response.data[0].url
+                revised_prompt_dalle3 = getattr(response.data[0], 'revised_prompt', None) # Safely access
                 if revised_prompt_dalle3: print(f"DALL-E 3 revised prompt: {revised_prompt_dalle3[:150]}...")
+                image_response = requests.get(image_url, timeout=60)
                 image_response.raise_for_status()
+                img_data = Image.open(io.BytesIO(image_response.content))
+                if img_data.mode == 'RGBA': # Ensure RGB for consistency, PNG can be RGBA
                     img_data = img_data.convert('RGB')
+                # Save the AI generated image (typically 1024x1024 from DALL-E)
                 img_data.save(filepath)
                 print(f"AI Image (DALL-E) saved: {filepath}")
                 return filepath
                 print(f"Requests Error downloading DALL-E image: {e}")
             except Exception as e:
                 print(f"Generic error during DALL-E image generation: {e}")
             print("Falling back to placeholder image due to DALL-E error.")
+            # Fallback uses video_frame_size to match what video expects if AI fails
             return self._create_placeholder_image_content(
                 f"[DALL-E Failed] Prompt: {image_prompt_text[:150]}...",
+                scene_identifier_filename, size=self.video_frame_size
             )
+        else: # AI not enabled or key missing
+            # print(f"AI image generation not enabled/ready. Creating placeholder.")
+            # Placeholder also uses video_frame_size for consistency in video pipeline
             return self._create_placeholder_image_content(
                 image_prompt_text, scene_identifier_filename, size=self.video_frame_size
             )
     def create_video_from_images(self, image_data_list, output_filename="final_video.mp4", fps=24, duration_per_image=3):
         if not image_data_list:
             print("No image data provided to create video.")
             return None
                 print(f"Image path invalid or not found: {img_path}. Skipping for video.")
                 continue
             try:
+                pil_image_original = Image.open(img_path)
+                if pil_image_original.mode != 'RGB': # Ensure RGB for video
+                    pil_image_original = pil_image_original.convert('RGB')
+                # Create a copy to resize (thumbnail modifies in-place)
+                pil_image_for_frame = pil_image_original.copy()
+                # Resize image to fit within self.video_frame_size, maintaining aspect ratio
+                pil_image_for_frame.thumbnail(self.video_frame_size, Image.Resampling.LANCZOS)
+                # Create a background canvas of the exact video_frame_size (e.g., 1280x720)
+                # This will letterbox/pillarbox the image if its aspect ratio differs from video_frame_size
+                background_canvas = Image.new('RGB', self.video_frame_size, (0,0,0)) # Black background
+                paste_x = (self.video_frame_size[0] - pil_image_for_frame.width) // 2
+                paste_y = (self.video_frame_size[1] - pil_image_for_frame.height) // 2
+                background_canvas.paste(pil_image_for_frame, (paste_x, paste_y))
+                frame_np = np.array(background_canvas) # Convert final PIL image to numpy array
+                # Base image clip
                 img_clip = ImageClip(frame_np).set_duration(duration_per_image)
+                # Ken Burns Effect (Simple Zoom In)
+                end_scale = 1.08 # Zoom to 108% of original size by the end
+                img_clip = img_clip.fx(vfx.resize, lambda t: 1 + (end_scale - 1) * (t / duration_per_image))
+                img_clip = img_clip.set_position('center') # Keep centered during zoom
+                # Text Overlay
+                overlay_text = f"Scene {scene_num}: {key_action}"
+                # Ensure font path is used if 'Arial' isn't found by ImageMagick/MoviePy
+                # For TextClip, moviepy relies on ImageMagick which has its own font discovery.
+                # Using a common font name like 'Arial' is often okay if mscorefonts are installed.
+                # If not, you might need to point to self.font_path_in_container
+                # Check if ImageMagick is installed in Docker, moviepy might need it for TextClip.
+                # `apt-get install imagemagick` in Dockerfile if TextClip has issues.
+                txt_clip = TextClip(
+                    overlay_text,
+                    fontsize=self.video_overlay_font_size,
+                    color=self.video_overlay_font_color,
+                    font=self.video_overlay_font, # Or self.font_path_in_container
+                    bg_color='rgba(0,0,0,0.6)',
+                    size=(self.video_frame_size[0] * 0.9, None), # Width 90% of video, height auto
+                    method='caption',
+                    align='West',
+                    kerning=-1
+                ).set_duration(duration_per_image - 0.5).set_start(0.25) # Start after 0.25s, end 0.25s before clip end
+                txt_clip = txt_clip.set_position(('center', 0.88), relative=True) # Position near bottom
                 video_with_text_overlay = CompositeVideoClip([img_clip, txt_clip], size=self.video_frame_size)
                 processed_clips.append(video_with_text_overlay)
             print("No clips could be processed for the video.")
             return None
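+        # Concatenate with a 0.5s overlap between consecutive clips (negative padding needs method="compose"); NOTE(review): a visible crossfade presumably also needs crossfadein on each clip - confirm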
+        final_video_clip = concatenate_videoclips(processed_clips, padding=-0.5, method="compose")
+        # Add fade in/out for the whole video
+        if final_video_clip.duration > 1: # Ensure video is long enough for fades
+            final_video_clip = final_video_clip.fx(vfx.fadein, 0.5).fx(vfx.fadeout, 0.5)
         output_path = os.path.join(self.output_dir, output_filename)
         print(f"Writing final video to: {output_path}")
         except Exception as e:
             print(f"Error writing final video file: {e}")
             return None
+        finally:
+            for clip_item in processed_clips:
+                if hasattr(clip_item, 'close'): clip_item.close()
             if hasattr(final_video_clip, 'close'): final_video_clip.close()