Upload 8 files
- cinematic_planning.py +100 -0
- evaluation.py +124 -0
- generation.py +230 -0
- main.py +169 -0
- prompt_template_control.py +48 -0
- requirements.txt +10 -0
- selected_storyboards.json +6 -0
- storyboard.py +85 -0
cinematic_planning.py
ADDED
@@ -0,0 +1,100 @@
import json
import os
from dotenv import load_dotenv
from openai import OpenAI

from generation import generate_video

# Load env for OpenAI
load_dotenv()
client = OpenAI()


def storyboard_to_pseudo_video(storyboard):
    return {
        "scene": storyboard["scene"],
        "characters": [
            {
                "id": "main",
                "emoji": "👧",
                "action": "walk",
                "path": "left_to_right",
                "emotion": storyboard["emotion"]
            }
        ],
        "duration_sec": 5,
        "camera": storyboard["shot_type"]
    }


# Generate natural language transition description
def generate_transition_description(previous_state, next_state, i):
    # You can replace this with GPT for smarter descriptions
    return f"Transition {i+1}: The character continues to walk through the {next_state['scene']} with a {next_state['characters'][0]['emotion']} expression."


# Convert pseudo-video spec to text prompt
def pseudo_video_to_prompt(pseudo_video):
    scene = pseudo_video["scene"]
    emotion = pseudo_video["characters"][0]["emotion"]
    camera = pseudo_video["camera"]
    action = pseudo_video["characters"][0]["action"]
    path = pseudo_video["characters"][0]["path"]
    duration = pseudo_video["duration_sec"]

    prompt = (
        f"Create a {duration}-second video showing a {emotion} scene in a {scene}. "
        f"A character (represented by emoji) performs the action '{action}' across the screen from {path.replace('_', ' ')}. "
        f"Use a {camera} to capture the atmosphere."
    )
    return prompt


# Iterative process
def build_scene_sequence(storyboard, model_id, num_keyframes=12):
    pseudo_video = storyboard_to_pseudo_video(storyboard)
    print("Pseudo-Video Spec:\n", json.dumps(pseudo_video, indent=2))

    previous_state = pseudo_video
    scene_sequence = []

    for i in range(num_keyframes):
        # 1️⃣ Generate transition text
        transition_text = generate_transition_description(previous_state, pseudo_video, i)

        # 2️⃣ Generate video prompt
        video_prompt = pseudo_video_to_prompt(pseudo_video)

        # 3️⃣ Generate video clip
        video_path = generate_video(video_prompt, model_id)

        # 4️⃣ Save this step
        scene_sequence.append({
            "transition_text": transition_text,
            "prompt": video_prompt,
            "video_path": video_path
        })

        # Optional: Update pseudo_video for next iteration if needed
        # Example: character moves deeper, emotion changes, etc.

    return scene_sequence


if __name__ == "__main__":
    os.makedirs("output", exist_ok=True)

    # Example storyboard
    storyboard = {
        "scene": "misty forest",
        "shot_type": "wide shot",
        "emotion": "mysterious"
    }

    scene_sequence = build_scene_sequence(storyboard, model_id="Veo-2", num_keyframes=3)

    print("\n--- Final Scene Sequence ---")
    for i, step in enumerate(scene_sequence):
        print(f"\nKeyframe {i+1}:")
        print("Transition:", step["transition_text"])
        print("Video:", step["video_path"])
evaluation.py
ADDED
@@ -0,0 +1,124 @@
import json
import numpy as np
import cv2
import torch

from dotenv import load_dotenv
from openai import OpenAI
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from skimage.metrics import structural_similarity as ssim


# Load env for OpenAI
load_dotenv()
client = OpenAI()

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def evaluate_with_gpt4(storyboard, video_description):
    system_prompt = (
        "You are a film critic evaluating how well a video matches a storyboard.\n"
        "Rate each of the following from 1 to 10:\n"
        "- Story Consistency: Does the video follow the scene and emotion described?\n"
        "- Shot Variety: Does it use interesting or varied camera angles?\n"
        "- Relevance: Does it suit the intended purpose (role, setting, emotion)?\n\n"
        "Provide scores and brief justifications for each.\n\n"
        "Format output as:\n"
        "{\n"
        " \"story_consistency\": <score>,\n"
        " \"shot_variety\": <score>,\n"
        " \"relevance\": <score>,\n"
        " \"justification\": \"...\"\n"
        "}"
    )

    user_prompt = (
        f"Storyboard:\n"
        f"Scene: {storyboard['scene']}\n"
        f"Shot: {storyboard['shot_type']}\n"
        f"Emotion: {storyboard['emotion']}\n\n"
        f"Video Description:\n{video_description}"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.3,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )

    content = response.choices[0].message.content.strip()
    # The model sometimes wraps its JSON in Markdown fences; keep only the JSON object
    if "{" in content:
        content = content[content.find("{"):content.rfind("}") + 1]
    return json.loads(content)


def compute_clip_similarity(image_path, text_prompt):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(text=[text_prompt], images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    # With a single caption, a softmax over one logit is always 1.0, so return the raw
    # image-text logit (cosine similarity scaled by CLIP's learned temperature) instead.
    similarity = outputs.logits_per_image[0, 0].item()
    return similarity


def compute_motion_score(video_path):
    cap = cv2.VideoCapture(video_path)
    prev_gray = None
    motion_values = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_gray is not None:
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None,
                                                0.5, 3, 15, 3, 5, 1.2, 0)
            magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
            motion_values.append(np.mean(magnitude))

        prev_gray = gray

    cap.release()
    return np.mean(motion_values) if motion_values else 0


def compute_temporal_coherence(video_path):
    cap = cv2.VideoCapture(video_path)
    prev_frame = None
    ssim_scores = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if prev_frame is not None:
            score = ssim(prev_frame, gray)
            ssim_scores.append(score)

        prev_frame = gray

    cap.release()
    return np.mean(ssim_scores) if ssim_scores else 0


def evaluate_video(storyboard, video_description, video_path, thumbnail_path, text_prompt):
    gpt_eval = evaluate_with_gpt4(storyboard, video_description)
    clip_score = compute_clip_similarity(thumbnail_path, text_prompt)
    motion_score = compute_motion_score(video_path)
    coherence_score = compute_temporal_coherence(video_path)

    return {
        "gpt_eval": gpt_eval,
        "metrics": {
            "clip_similarity": clip_score,
            "motion_score": motion_score,
            "temporal_coherence": coherence_score
        }
    }
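A minimal usage sketch for evaluate_video (not part of the uploaded files; the clip path, thumbnail path, storyboard, and prompts below are assumptions for illustration):

import cv2
from evaluation import evaluate_video

video_path = "output/sample.mp4"  # assumed path of an already-generated clip

# Grab the first frame as a thumbnail for the CLIP image-text check
cap = cv2.VideoCapture(video_path)
ok, frame = cap.read()
cap.release()
if ok:
    cv2.imwrite("output/sample_thumbnail.png", frame)

storyboard = {"scene": "misty forest", "shot_type": "wide shot", "emotion": "mysterious"}
report = evaluate_video(
    storyboard=storyboard,
    video_description="A girl walks through a misty forest, seen in a wide shot.",
    video_path=video_path,
    thumbnail_path="output/sample_thumbnail.png",
    text_prompt="a mysterious wide shot of a misty forest",
)
print(report["metrics"])
print(report["gpt_eval"])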
generation.py
ADDED
@@ -0,0 +1,230 @@
import uuid

import torch
from diffusers.utils import export_to_video
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

import os
import time
import requests
import json

from PIL import Image as PIL_Image
from google import genai
from google.genai import types
from google.cloud import aiplatform
from google.cloud import storage
import matplotlib.pyplot as plt
import mediapy as media


def wan_text_to_video(prompt, negative_prompt=None):
    # Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
    # model_id = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
    model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
    vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
    flow_shift = 5.0  # 5.0 for 720P, 3.0 for 480P
    scheduler = UniPCMultistepScheduler(prediction_type='flow_prediction', use_flow_sigmas=True, num_train_timesteps=1000, flow_shift=flow_shift)
    pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
    pipe.scheduler = scheduler
    pipe.to("cuda" if torch.cuda.is_available() else "cpu")

    # Example prompt (previously hard-coded here, which overrode the caller's argument):
    # "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the
    #  dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through
    #  the window."

    # Fall back to a generic negative prompt when the caller does not provide one
    if not negative_prompt:
        negative_prompt = ("Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, "
                           "images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, "
                           "incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, "
                           "misshapen limbs, fused fingers, still picture, messy background, three legs, many people in "
                           "the background, walking backwards")

    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=720,
        width=1280,
        num_frames=81,
        guidance_scale=5.0,
    ).frames[0]
    export_to_video(output, "output.mp4", fps=16)

    return "output.mp4"


def gcp_veo(prompt: str = "a cat reading a book"):
    PROJECT_ID = "gcp-credit-applying-to-g-suite"
    LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")
    BUCKET_NAME = "dante-test-123456-output"
    OUTPUT_GCS_PATH = f"gs://{BUCKET_NAME}/videos/output_{int(time.time())}.mp4"

    # Initialize Vertex AI
    aiplatform.init(project=PROJECT_ID, location=LOCATION)

    # Initialize Generative AI client
    client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

    # Video Generation Pipeline
    video_model = "veo-2.0-generate-001"
    # video_model = "veo-3.0-generate-preview"
    aspect_ratio = "16:9"

    operation = client.models.generate_videos(
        model=video_model,
        prompt=prompt,
        config=types.GenerateVideosConfig(
            aspect_ratio=aspect_ratio,
            output_gcs_uri=OUTPUT_GCS_PATH,
            number_of_videos=1,
            duration_seconds=5,
            person_generation="allow_adult",
            enhance_prompt=True,
        ),
    )

    # Poll until operation is complete
    print("Generating video...")
    while not operation.done:
        time.sleep(15)
        operation = client.operations.get(operation)
        print(f"Operation status: {operation}")

    # Error Handling
    if operation.error:
        raise Exception(f"Video generation failed: {operation.error}")

    # Get the generated video URI
    if operation.response and operation.result.generated_videos:
        video_uri = operation.result.generated_videos[0].video.uri
        print(f"Video generated at: {video_uri}")

        # Download the video from GCS to local
        storage_client = storage.Client(project=PROJECT_ID)
        bucket = storage_client.bucket(BUCKET_NAME)
        blob_name = video_uri.replace(f"gs://{BUCKET_NAME}/", "")
        blob = bucket.blob(blob_name)

        local_output_path = f"output/sample-{uuid.uuid1()}.mp4"

        # Ensure local directory exists
        os.makedirs(os.path.dirname(local_output_path), exist_ok=True)

        # Download the video
        blob.download_to_filename(local_output_path)
        print(f"Video downloaded to: {local_output_path}")

        # Delete the file from GCS
        blob.delete()
        print(f"Video deleted from GCS: {video_uri}")

        return local_output_path
    else:
        raise Exception("No video generated or response is empty")


def hailuo_text_to_video(
        prompt: str,
        model: str = "T2V-01-Director",
        output_file_name: str = "output.mp4",
        # Assumed env var name; pass api_key explicitly if your key lives elsewhere
        api_key: str = os.environ.get("MINIMAX_API_KEY", "")
) -> str:
    def invoke_video_generation() -> str:
        print("-----------------Submit video generation task-----------------")
        url = "https://api.minimaxi.chat/v1/video_generation"
        payload = json.dumps({
            "prompt": prompt,
            "model": model
        })
        headers = {
            'authorization': 'Bearer ' + api_key,
            'content-type': 'application/json',
        }

        response = requests.request("POST", url, headers=headers, data=payload)
        print(response.text)
        task_id = response.json()['task_id']
        print("Video generation task submitted successfully, task ID: " + task_id)
        return task_id

    def query_video_generation(task_id: str):
        url = "https://api.minimaxi.chat/v1/query/video_generation?task_id=" + task_id
        headers = {
            'authorization': 'Bearer ' + api_key
        }
        response = requests.request("GET", url, headers=headers)
        status = response.json()['status']
        if status == 'Preparing':
            print("...Preparing...")
            return "", 'Preparing'
        elif status == 'Queueing':
            print("...In the queue...")
            return "", 'Queueing'
        elif status == 'Processing':
            print("...Generating...")
            return "", 'Processing'
        elif status == 'Success':
            return response.json()['file_id'], "Finished"
        elif status == 'Fail':
            return "", "Fail"
        else:
            return "", "Unknown"

    def fetch_video_result(file_id: str):
        print("---------------Video generated successfully, downloading now---------------")
        url = "https://api.minimaxi.chat/v1/files/retrieve?file_id=" + file_id
        headers = {
            'authorization': 'Bearer ' + api_key,
        }

        response = requests.request("GET", url, headers=headers)
        print(response.text)

        download_url = response.json()['file']['download_url']
        print("Video download link: " + download_url)
        with open(output_file_name, 'wb') as f:
            f.write(requests.get(download_url).content)
        print("The video has been downloaded to: " + os.getcwd() + '/' + output_file_name)

    task_id = invoke_video_generation()
    print("-----------------Video generation task submitted -----------------")
    while True:
        time.sleep(10)

        file_id, status = query_video_generation(task_id)
        if file_id != "":
            fetch_video_result(file_id)
            print("---------------Successful---------------")
            break
        elif status == "Fail" or status == "Unknown":
            print("---------------Failed---------------")
            break

    return os.getcwd() + '/' + output_file_name


def generate_video(prompt, model_id, negative_prompt=None):
    video_path = None
    if model_id == "Wan2.1":
        video_path = wan_text_to_video(prompt, negative_prompt)
    elif model_id == "SkyReels-V2":
        raise ValueError("SkyReels-V2 model not yet implemented.")
    elif model_id == "Veo-2":
        video_path = gcp_veo(prompt)
    elif model_id == "T2V-01-Director":
        video_path = hailuo_text_to_video(prompt)
    return video_path


# Only available for cuda / cpu
# wan_text_to_video()


# if __name__ == "__main__":
#     try:
#         local_path = gcp_veo_3(
#             prompt="a cat reading a book",
#             local_output_path="output/cat_reading_book.mp4"
#         )
#         print(f"Success! Video saved at: {local_path}")
#     except Exception as e:
#         print(f"Error: {e}")
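A minimal usage sketch for the generate_video dispatcher above (not part of the uploaded files). "Veo-2" assumes GCP credentials for the hard-coded project and bucket are available; "Wan2.1" runs locally but is very slow without a GPU. The prompt below is illustrative only:

from generation import generate_video

clip_path = generate_video(
    prompt="A girl walks into a misty forest at night, wide shot, mysterious mood.",
    model_id="Veo-2",
)
print("Saved clip:", clip_path)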
main.py
ADDED
@@ -0,0 +1,169 @@
import gradio as gr
import json

from cinematic_planning import build_scene_sequence
from generation import generate_video
from prompt_template_control import generate_video_prompt_with_template
from storyboard import generate_multiple_storyboards


def save_storyboard_choice(choice: str) -> str:
    # gr.Radio returns the selected choice as a JSON string (see update_storyboards below);
    # save it as one JSON line (append mode)
    if not choice:
        return "⚠️ No storyboard selected."
    storyboard = json.loads(choice)
    with open("selected_storyboards.json", "a") as f:
        f.write(json.dumps(storyboard) + "\n")
    return f"✅ Saved your selection to selected_storyboards.json:\n\n{json.dumps(storyboard, indent=2)}"


# Connect button
def run_pseudo_video_workflow(scene, shot_type, emotion, model_choice, num_keyframes):
    # Build storyboard dict
    storyboard = {
        "scene": scene,
        "shot_type": shot_type,
        "emotion": emotion
    }

    # Call your iterative builder
    scene_sequence = build_scene_sequence(
        storyboard, model_choice, num_keyframes=num_keyframes
    )

    # Format result as text
    result_text = ""
    for i, step in enumerate(scene_sequence):
        result_text += f"\nKeyframe {i + 1}:\n"
        result_text += f"Transition: {step['transition_text']}\n"
        result_text += f"Video Path: {step['video_path']}\n"

    return result_text


if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("# 🎥 Video Generator")

        # Video Generator Interface
        with gr.Row():
            with gr.Column():
                video_prompt = gr.Textbox(label="Enter your video prompt")
                negative_prompt = gr.Textbox(label="Enter your negative prompt (optional: Wan2.1 only)")
                model_choice = gr.Radio(
                    choices=["SkyReels-V2", "Wan2.1", "Veo-2", "T2V-01-Director"],
                    label="Choose the video generation model"
                )
                generate_btn = gr.Button("Generate Video")
            with gr.Column():
                video_output = gr.Video(label="Generated Video")

        generate_btn.click(
            generate_video,
            inputs=[video_prompt, model_choice, negative_prompt],
            outputs=video_output
        )

        # Divider
        gr.Markdown("---")

        # Narrative to Storyboard interface
        gr.Markdown("# 🎬 Narrative to Storyboard Grounding")
        narrative_input = gr.Textbox(label="Enter your narrative")
        generate_storyboards_btn = gr.Button("Generate 5 Storyboards")
        storyboards_output = gr.Radio(
            choices=[],
            label="Select your preferred storyboard"
        )
        save_choice_btn = gr.Button("Save Selection")
        save_output = gr.Textbox(label="Save Output", interactive=False)

        # Generate the storyboards
        def update_storyboards(narrative):
            cards = generate_multiple_storyboards(narrative)
            # gr.Radio choices must be plain strings, so serialize each storyboard dict to JSON
            return gr.update(choices=[json.dumps(card) for card in cards])

        generate_storyboards_btn.click(
            update_storyboards,
            inputs=narrative_input,
            outputs=storyboards_output
        )

        # Save the choice
        save_choice_btn.click(
            save_storyboard_choice,
            inputs=storyboards_output,
            outputs=save_output
        )

        gr.Markdown("---")

        # Prompt Injection + Template Control
        gr.Markdown("# 🎥 Prompt Injection + Template Control (LLM + T2V)")

        # Modular controls
        role_input = gr.Textbox(label="Role", placeholder="e.g., Product demo")
        setting_input = gr.Textbox(label="Setting", placeholder="e.g., Urban bar")
        emotion_input = gr.Textbox(label="Emotion", placeholder="e.g., Energetic")
        shot_input = gr.Textbox(label="Shot Type", placeholder="e.g., Front-facing")
        duration_input = gr.Textbox(label="Duration", placeholder="e.g., 5s loop")

        # Model selection
        model_choice = gr.Radio(
            choices=["SkyReels-V2", "Veo-2", "Runway", "T2V-01-Director"],
            label="Choose video generation model"
        )

        # Generate final natural language prompt
        generate_prompt_btn = gr.Button("Generate Final Prompt")
        final_prompt_output = gr.Textbox(label="Final Video Prompt", interactive=False)

        # Generate video
        generate_video_btn = gr.Button("Generate Video")
        video_output = gr.Video(label="Generated Video")

        # Connect callbacks
        generate_prompt_btn.click(
            generate_video_prompt_with_template,
            inputs=[role_input, setting_input, emotion_input, shot_input, duration_input],
            outputs=final_prompt_output
        )

        generate_video_btn.click(
            generate_video,
            inputs=[final_prompt_output, model_choice, negative_prompt],
            outputs=video_output
        )

        gr.Markdown("# 🎞️ Pseudo Video Workflow (Storyboard → Scene Builder)")

        # Storyboard inputs
        pseudo_scene_input = gr.Textbox(label="Scene", placeholder="e.g., Misty forest")
        pseudo_shot_input = gr.Textbox(label="Shot Type", placeholder="e.g., Wide shot")
        pseudo_emotion_input = gr.Textbox(label="Emotion", placeholder="e.g., Mysterious")

        pseudo_model_choice = gr.Radio(
            choices=["SkyReels-V2", "Wan2.1", "Veo-2", "T2V-01-Director"],
            label="Choose video generation model"
        )

        num_keyframes_input = gr.Slider(minimum=1, maximum=20, value=12, label="Number of Keyframes")

        run_pseudo_video_btn = gr.Button("Build Pseudo Video Workflow")

        pseudo_output = gr.Textbox(label="Workflow Result", lines=10)

        # Hook to Gradio button
        run_pseudo_video_btn.click(
            run_pseudo_video_workflow,
            inputs=[
                pseudo_scene_input,
                pseudo_shot_input,
                pseudo_emotion_input,
                pseudo_model_choice,
                num_keyframes_input
            ],
            outputs=pseudo_output
        )

    demo.launch()
prompt_template_control.py
ADDED
@@ -0,0 +1,48 @@
from openai import OpenAI
from dotenv import load_dotenv

# Env variable
load_dotenv()

# Initialize OpenAI client
client = OpenAI()


def generate_video_prompt_with_template(role: str, setting: str, emotion: str, shot: str, duration: str) -> str:
    system_prompt = (
        "You are a video director who converts structured metadata into detailed, natural language video prompts.\n"
        "Here are examples:\n\n"
        "Example 1:\n"
        "- Role: Product demo\n"
        "- Setting: Urban bar\n"
        "- Emotion: Energetic\n"
        "- Shot: Front-facing, 5s loop\n"
        "Output: \"Create a short 5-second video of a product demo in an energetic tone. "
        "The scene takes place in an urban bar setting, using a front-facing camera to capture the vibrant atmosphere.\"\n\n"
        "Example 2:\n"
        "- Role: Storytelling\n"
        "- Setting: Forest, misty\n"
        "- Emotion: Mysterious\n"
        "- Shot: Wide shot\n"
        "Output: \"Create a video showing a mysterious scene in a misty forest. Use a wide shot to capture the atmosphere and suspense.\"\n\n"
        "Now, create a natural language video prompt for the following:\n"
    )

    user_prompt = (
        f"- Role: {role}\n"
        f"- Setting: {setting}\n"
        f"- Emotion: {emotion}\n"
        f"- Shot: {shot}, {duration}\n"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.3
    )

    final_prompt = response.choices[0].message.content.strip()
    return final_prompt
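A minimal example call for the template-controlled prompt builder (not part of the uploaded files; assumes OPENAI_API_KEY is set in .env, and the field values below are illustrative):

from prompt_template_control import generate_video_prompt_with_template

prompt = generate_video_prompt_with_template(
    role="Product demo",
    setting="Urban bar",
    emotion="Energetic",
    shot="Front-facing",
    duration="5s loop",
)
print(prompt)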
requirements.txt
ADDED
@@ -0,0 +1,10 @@
torch
diffusers
transformers
ftfy==6.3.1
gradio
google-genai
mediapy
google-cloud-aiplatform
google-cloud-storage
openai
python-dotenv
opencv-python
scikit-image
numpy
Pillow
requests
matplotlib
selected_storyboards.json
ADDED
@@ -0,0 +1,6 @@
{
  "scene": "A dense, shadowy forest shrouded in mist. The trees are tall and imposing, their branches intertwining overhead, creating a canopy that blocks out the moonlight. The ground is covered in a thick layer of fallen leaves, and the air is filled with the sound of distant rustling and the occasional hoot of an owl.",
  "shot_type": "Wide shot",
  "emotion": "Eerie and mysterious",
  "version": 4
}
storyboard.py
ADDED
@@ -0,0 +1,85 @@
import json
import re
from typing import List, Dict

from dotenv import load_dotenv
from openai import OpenAI


# Env variable
load_dotenv()

# Initialize OpenAI client
client = OpenAI()


def narrative_to_storyboard(narrative: str) -> Dict[str, str]:
    """
    Converts a narrative prompt into a structured storyboard dict
    with scene, shot type, and emotion using an LLM.
    """
    system_prompt = (
        "You are a professional storyboard artist and cinematographer. "
        "Given a narrative, extract and describe:\n"
        "- scene: The environment and visual setting\n"
        "- shot_type: The camera angle or framing (e.g., wide shot, close-up)\n"
        "- emotion: The overall mood or emotional tone\n\n"
        "Return the result as a JSON dictionary with keys: scene, shot_type, emotion."
    )

    user_prompt = f"Narrative: {narrative}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.3
    )

    content = response.choices[0].message.content.strip()

    # Use regex to extract JSON block
    json_match = re.search(r"\{[\s\S]*\}", content)
    if json_match:
        json_str = json_match.group(0)
        try:
            parsed_json = json.loads(json_str)
            return parsed_json
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return {"scene": "", "shot_type": "", "emotion": ""}
    else:
        print("No JSON block found in response.")
        return {"scene": "", "shot_type": "", "emotion": ""}


def generate_multiple_storyboards(narrative: str, num_versions: int = 5) -> List[Dict[str, str]]:
    """
    Generate multiple storyboards for the same narrative by calling the LLM-based
    narrative_to_storyboard() function multiple times with slight variations.
    """
    storyboards = []

    for i in range(num_versions):
        # Add variation to the narrative to encourage different outputs
        variant_narrative = f"{narrative}\nPlease provide a different creative version #{i+1}."
        storyboard = narrative_to_storyboard(variant_narrative)
        storyboard['version'] = i + 1  # Track version number
        storyboards.append(storyboard)

    return storyboards


if __name__ == "__main__":
    print("Testing Narrative to Storyboard...")
    narrative_text = "A girl walks into a dark forest on a misty night."
    # storyboard_output = narrative_to_storyboard(narrative_text)
    # print(storyboard_output)

    print("Generating 5 storyboards based on the narrative...")
    storyboard_list = generate_multiple_storyboards(narrative_text)
    for i, sb in enumerate(storyboard_list):
        print(f"Version {i + 1}: {sb}")