import os

import utils


def _format_metadata_lines(metadata) -> str:
    """Render a metadata mapping as a bullet list, one ``- key: value`` per line.

    Returns an empty string when ``metadata`` is ``None`` or empty; otherwise
    the list is prefixed with a newline so it starts on its own line when
    placed after a label such as ``Metadata:``.
    """
    if not metadata:
        return ''
    return '\n' + '\n'.join(f'- {key}: {value}' for key, value in metadata.items())


def image_to_text_prompt(image_path: str, metadata: dict = None) -> str:
    """Generate a text prompt to represent an image file with its metadata.

    Args:
        image_path: Path of the image; only its base name appears in the prompt.
        metadata: Optional mapping rendered as ``- key: value`` bullet lines.

    Returns:
        The prompt text, with a leading and a trailing newline.
    """
    metadata_lines = _format_metadata_lines(metadata)
    return (
        f'\nFilename: {os.path.basename(image_path)}'
        f'\nMetadata: {metadata_lines}'
        '\n'
    )


def video_to_text_prompt(video_path: str, metadata: dict = None) -> str:
    """Generate a text prompt to represent a video file with its metadata.

    Args:
        video_path: Path of the video; only its base name appears in the prompt.
        metadata: Optional mapping rendered as ``- key: value`` bullet lines.

    Returns:
        The prompt text, with a leading and a trailing newline.
    """
    metadata_lines = _format_metadata_lines(metadata)
    # NOTE(review): the previous template interpolated nothing, so the filename
    # and metadata were dropped. The layout below mirrors image_to_text_prompt;
    # confirm it matches the intended video prompt format.
    return (
        f'\nFilename: {os.path.basename(video_path)}'
        f'\nMetadata: {metadata_lines}'
        '\n'
    )


def video_segment_to_text_prompt(
    start: float,
    end: float,
    transcript_segments: list[dict],
    frame_paths: list[str]
) -> str:
    """Generate a text prompt to represent a video segment.

    The prompt contains the segment timespan, its transcript segments, and its
    frame image paths.

    Args:
        start: Segment start in seconds; truncated to an int before formatting.
        end: Segment end in seconds; truncated to an int before formatting.
        transcript_segments: Dicts providing ``"start"``, ``"end"`` and
            ``"text"`` keys.
        frame_paths: Frame image paths, emitted one per line.

    Returns:
        The prompt text, with a leading and a trailing newline.
    """
    # Timespan of the whole segment, formatted by utils.seconds_to_hms.
    timespan_text = f'{utils.seconds_to_hms(int(start))} - {utils.seconds_to_hms(int(end))}'

    # One bullet per transcript segment; these timestamps pass drop_hours=True.
    transcript_lines = '\n'.join(
        f'- {utils.seconds_to_hms(int(segment["start"]), drop_hours=True)}'
        f'-{utils.seconds_to_hms(int(segment["end"]), drop_hours=True)}: {segment["text"]}'
        for segment in transcript_segments
    )
    if transcript_lines:
        # Start the bullet list on its own line after the "Transcript:" label.
        transcript_lines = '\n' + transcript_lines

    # Frame images, one path per line.
    frame_images_lines = '\n'.join(f'{frame_path}' for frame_path in frame_paths)

    return (
        f'\nTimespan: {timespan_text}'
        f'\nTranscript: {transcript_lines}'
        f'\n{frame_images_lines}'
        '\n'
    )