| |
|
| |
|
| |
|
| |
|
| | from pathlib import Path
|
| | import pickle
|
| | import numpy as np
|
| | import math
|
| | import matplotlib.pyplot as plt
|
| | from PIL import Image
|
| | from skimage.draw import polygon
|
| | from third_party.yoloworld_demo import get_2dbox_open_vocabulary_detector
|
| | from third_party.depth_demo import get_3d_location
|
| |
|
| |
|
class FuncAgent:
    """
    Bundle scene data with wrappers around the open-vocabulary 2D detector
    and the 3D locator imported at the top of this module.
    """

    def __init__(self, data_dict=None, json_data_dict=None) -> None:
        """
        Initialize function agent for visual processing tasks

        Args:
            data_dict: Dictionary containing scene data
            json_data_dict: Dictionary containing JSON metadata
        """
        self.data = data_dict
        self.json_data_dict = json_data_dict
        self.short_trajectory_description = False

        # These bare names resolve to the module-level schema dicts, not to the
        # same-named methods below (class attributes are not in a method's
        # name-lookup scope).
        self.visual_func_infos = [
            get_open_world_vocabulary_detection_info,
            get_3d_loc_in_cam_info,
            resize_image_info,
            crop_image_info,
        ]

    def _find_camera_image_path(self, cam_type: str) -> str:
        """
        Look up the image path recorded for a camera

        A path belongs to a camera when its second '/'-separated component
        equals ``cam_type``. When several paths match, the last one wins.

        Args:
            cam_type: Camera type to look for

        Returns:
            The matching entry of ``self.json_data_dict['image']``

        Raises:
            ValueError: If no image path matches ``cam_type``
        """
        image_paths = self.json_data_dict['image']
        matching_paths = []
        for image_path in image_paths:
            components = image_path.split('/')
            # A path without a second component cannot name a camera; skip it
            # instead of failing with IndexError.
            if len(components) > 1 and components[1] == cam_type:
                matching_paths.append(image_path)

        if not matching_paths:
            raise ValueError(
                f"No image path found for camera type {cam_type!r} "
                f"among {len(image_paths)} image path(s)"
            )
        return matching_paths[-1]

    def get_open_world_vocabulary_detection(self, object_names: list, cam_type: str):
        """
        Detect objects in an image using open vocabulary detection

        Args:
            object_names: List of objects to detect
            cam_type: Camera type to process

        Returns:
            Tuple of prompts and detected bounding boxes, as returned by
            get_2dbox_open_vocabulary_detector

        Raises:
            ValueError: If no image path matches ``cam_type``
        """
        choosed_image_path = self._find_camera_image_path(cam_type)
        prompts, detected_2d_boxs = get_2dbox_open_vocabulary_detector(
            text=object_names,
            image_path=choosed_image_path
        )

        return prompts, detected_2d_boxs

    def get_open_world_vocabulary_detection_info(self, object_names: list, image_path: str):
        """
        Detect objects in an image using open vocabulary detection

        Args:
            object_names: List of objects to detect
            image_path: Path to the image file

        Returns:
            Tuple of prompts and detected bounding boxes, as returned by
            get_2dbox_open_vocabulary_detector
        """
        prompts, detected_2d_boxs = get_2dbox_open_vocabulary_detector(
            text=object_names,
            image_path=image_path
        )
        return prompts, detected_2d_boxs

    def get_3d_loc_in_cam_info(self, object_names: list, image_path: str):
        """
        Get 3D locations of objects in camera coordinates

        Args:
            object_names: List of objects to locate
            image_path: Path to the image file

        Returns:
            Tuple of prompts and 3D locations, as returned by get_3d_location
        """
        prompts, detected_loc_3d = get_3d_location(
            text=object_names,
            image_path=image_path
        )
        return prompts, detected_loc_3d

    def get_ego_states(self):
        """Get ego vehicle state information"""
        # NOTE(review): `get_ego_prompts` is neither defined nor imported in the
        # visible part of this file - confirm where it is meant to come from.
        return get_ego_prompts(self.data)
|
| |
|
| |
|
| |
|
# Function-calling schema describing the `resize_image` tool.
resize_image_info = dict(
    name="resize_image",
    description="Resizes an image to specified dimensions with interpolation support",
    parameters=dict(
        type="object",
        properties={
            "input_path": {
                "type": "string",
                "description": "Input image file path",
            },
            "output_path": {
                "type": "string",
                "description": "Output path for resized image",
            },
            # Exactly two integers.
            "target_size": {
                "type": "array",
                "items": {"type": "integer"},
                "minItems": 2,
                "maxItems": 2,
                "description": "Target dimensions [width, height]",
            },
            # Optional: not listed under "required".
            "interpolation": {
                "type": "integer",
                "description": "Interpolation method (e.g., Image.BILINEAR for bilinear interpolation)",
            },
        },
        required=["input_path", "output_path", "target_size"],
    ),
)
|
| |
|
| |
|
def resize_image(input_path, output_path, target_size, interpolation=Image.BILINEAR):
    """
    Write a resized copy of an image to disk

    Args:
        input_path: Path of the image file to read
        output_path: Path the resized image is saved to
        target_size: Target dimensions (width, height)
        interpolation: Resampling filter handed to PIL (default: bilinear)
    """
    with Image.open(input_path) as source:
        # Resize and save while the source file is still open.
        source.resize(target_size, interpolation).save(output_path)
|
| |
|
| |
|
# Function-calling schema describing the `crop_image` tool.
crop_image_info = dict(
    name="crop_image",
    description="Crops a rectangular region from an image",
    parameters=dict(
        type="object",
        properties={
            "input_path": {
                "type": "string",
                "description": "Input image file path",
            },
            "output_path": {
                "type": "string",
                "description": "Output path for cropped image",
            },
            # Exactly four integers.
            "box": {
                "type": "array",
                "items": {"type": "integer"},
                "minItems": 4,
                "maxItems": 4,
                "description": "Crop region coordinates [left, upper, right, lower]",
            },
        },
        required=["input_path", "output_path", "box"],
    ),
)
|
| |
|
| |
|
def crop_image(input_path, output_path, box):
    """
    Write a cropped copy of an image to disk

    Args:
        input_path: Path of the image file to read
        output_path: Path the cropped image is saved to
        box: Crop region coordinates (left, upper, right, lower)
    """
    with Image.open(input_path) as source:
        # Crop and save while the source file is still open.
        source.crop(box).save(output_path)
|
| |
|
| |
|
# Function-calling schema describing the `rotate_image` tool.
rotate_image_info = {
    "name": "rotate_image",
    "description": "Rotates an image by specified degrees with canvas expansion support",
    "parameters": {
        "type": "object",
        "properties": {
            "input_path": {"type": "string", "description": "Input image file path"},
            "output_path": {"type": "string", "description": "Output path for rotated image"},
            # `rotate_image` forwards this angle unchanged to PIL's Image.rotate,
            # which rotates counter-clockwise; the description must say so or
            # callers will rotate the wrong way.
            "degrees": {"type": "number", "description": "Rotation angle in degrees (counter-clockwise)"},
            "expand": {
                "type": "boolean",
                "description": "Whether to expand canvas to fit rotation (default: False)"
            },
            "fill_color": {
                "type": "array",
                "items": {"type": "integer"},
                "minItems": 3,
                "maxItems": 3,
                "description": "RGB fill color for expanded areas (default: [255,255,255])"
            }
        },
        "required": ["input_path", "output_path", "degrees"]
    }
}
|
| |
|
| |
|
def rotate_image(input_path, output_path, degrees, expand=False, fill_color=(255, 255, 255)):
    """
    Rotate an image by specified degrees

    Args:
        input_path: Path to input image file
        output_path: Path to save rotated image
        degrees: Rotation angle in degrees, forwarded unchanged to PIL's
            Image.rotate (which rotates counter-clockwise)
        expand: Whether to expand canvas to fit rotation
        fill_color: Fill color for areas outside the rotated image; a list
            (e.g. decoded from a JSON array) is converted to a tuple
    """
    # Arguments decoded from JSON arrive as lists, while Pillow documents the
    # fill colour as an int or tuple; normalise lists, leave anything else as is.
    if isinstance(fill_color, list):
        fill_color = tuple(fill_color)

    with Image.open(input_path) as img:
        rotated_img = img.rotate(degrees, expand=expand, fillcolor=fill_color)
        rotated_img.save(output_path)
|
| |
|
| |
|
# Function-calling schema describing the `adjust_brightness` tool.
adjust_brightness_info = dict(
    name="adjust_brightness",
    description="Adjusts image brightness using enhancement factor",
    parameters=dict(
        type="object",
        properties={
            "input_path": {
                "type": "string",
                "description": "Input image file path",
            },
            "output_path": {
                "type": "string",
                "description": "Output path for adjusted image",
            },
            "factor": {
                "type": "number",
                "description": "Brightness multiplier (1.0=original, >1.0=brighter, <1.0=darker)",
            },
        },
        required=["input_path", "output_path", "factor"],
    ),
)
|
| |
|
| |
|
def adjust_brightness(input_path, output_path, factor):
    """
    Adjust image brightness

    Args:
        input_path: Path to input image file
        output_path: Path to save adjusted image
        factor: Brightness multiplier (1.0=original, >1.0=brighter, <1.0=darker)
    """
    # The module header imports only `Image` from PIL, so `ImageEnhance` was an
    # undefined name here and every call raised NameError. Import it locally.
    from PIL import ImageEnhance

    with Image.open(input_path) as img:
        enhancer = ImageEnhance.Brightness(img)
        bright_img = enhancer.enhance(factor)
        bright_img.save(output_path)
|
| |
|
| |
|
# Function-calling schema describing the open-vocabulary detection tool.
# NOTE(review): the parameter is named "text" here, while the FuncAgent methods
# take `object_names` (and forward it as `text=`) - confirm which name the
# dispatcher expects.
get_open_world_vocabulary_detection_info = {
    "name": "get_open_world_vocabulary_detection",
    "description": "Detects objects in an image using open vocabulary detection",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                # JSON Schema type names, matching the other schemas in this
                # file ("list" / "str" are Python names, not schema types).
                "type": "array",
                # presumably object names are strings - TODO confirm
                "items": {"type": "string"},
                "description": "List of objects to detect",
            },
            "image_path": {
                "type": "string",
                "description": "Path to the image file"
            }
        },
        "required": ["text", "image_path"],
    },
}
|
| |
|
| |
|
# Function-calling schema describing the 3D-localisation tool.
# NOTE(review): the advertised name is "get_3d_loc_in_cam", while the FuncAgent
# method is `get_3d_loc_in_cam_info` and its parameter is `object_names`, not
# "text" - confirm how the dispatcher maps these.
get_3d_loc_in_cam_info = {
    "name": "get_3d_loc_in_cam",
    "description": "Calculates 3D locations of objects in camera coordinates",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                # JSON Schema type names, matching the other schemas in this
                # file ("list" / "str" are Python names, not schema types).
                "type": "array",
                # presumably object names are strings - TODO confirm
                "items": {"type": "string"},
                "description": "List of objects to locate",
            },
            "image_path": {
                "type": "string",
                "description": "Path to the image file"
            }
        },
        "required": ["text", "image_path"],
    },
}