# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gym environment for the ActiveVision Dataset. | |
The dataset is captured with a robot moving around and taking picture in | |
multiple directions. The actions are moving in four directions, and rotate | |
clockwise or counter clockwise. The observations are the output of vision | |
pipelines such as object detectors. The goal is to find objects of interest | |
in each environment. For more details, refer: | |
http://cs.unc.edu/~ammirato/active_vision_dataset_website/. | |
""" | |

import collections
import copy
import json
import os
from StringIO import StringIO
import time

from absl import logging
import cv2
import gin
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
import tensorflow as tf

import label_map_util
import visualization_utils as vis_util
from envs import task_env

register(
    id='active-vision-env-v0',
    entry_point=
    'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv',  # pylint: disable=line-too-long
)

_MAX_DEPTH_VALUE = 12102

SUPPORTED_ACTIONS = [
    'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
SUPPORTED_MODALITIES = [
    task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
    task_env.ModalityTypes.DEPTH,
    task_env.ModalityTypes.OBJECT_DETECTION,
    task_env.ModalityTypes.IMAGE,
    task_env.ModalityTypes.GOAL,
    task_env.ModalityTypes.PREV_ACTION,
    task_env.ModalityTypes.DISTANCE,
]

# Data structure for storing the information related to the graph of the world.
_Graph = collections.namedtuple('_Graph', [
    'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal'
])


def _init_category_index(label_map_path):
  """Creates a category index from class indexes to names of the classes.

  Args:
    label_map_path: path to the mapping.

  Returns:
    A map from int keys to string categories.
  """
  label_map = label_map_util.load_labelmap(label_map_path)
  num_classes = np.max([x.id for x in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes=num_classes, use_display_name=True)
  category_index = label_map_util.create_category_index(categories)
  return category_index


def _draw_detections(image_np, detections, category_index):
  """Draws detections on the image.

  Args:
    image_np: Image in the form of a uint8 numpy array.
    detections: a dictionary that contains the detection outputs.
    category_index: contains the mapping between indexes and the category
      names.

  Returns:
    Does not return anything; the boxes are drawn on the input image in place.
  """
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      detections['detection_boxes'],
      detections['detection_classes'],
      detections['detection_scores'],
      category_index,
      use_normalized_coordinates=True,
      max_boxes_to_draw=1000,
      min_score_thresh=.0,
      agnostic_mode=False)


def generate_detection_image(detections,
                             image_size,
                             category_map,
                             num_classes,
                             is_binary=True):
  """Generates a one-hot style image of the detection boxes.

  Args:
    detections: 2D object detections from the image. It's a dictionary that
      contains detection_boxes, detection_classes, and detection_scores with
      dimensions of nx4, nx1, nx1 where n is the number of detections.
    image_size: The resolution of the output image.
    category_map: dictionary that maps label names to indexes.
    num_classes: Number of classes.
    is_binary: If true, sets the corresponding channels to 0 and 1. Otherwise,
      sets the score in the corresponding channel.

  Returns:
    An image_size x image_size x num_classes image for the detection boxes.
  """
  res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
  boxes = detections['detection_boxes']
  labels = detections['detection_classes']
  scores = detections['detection_scores']
  for box, label, score in zip(boxes, labels, scores):
    transformed_boxes = [int(round(t)) for t in box * image_size]
    y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with an area of
    # zero do not correspond to any real detection, so they are skipped.
    if (y2 - y1) * (x2 - x1) == 0:
      continue
    assert category_map[label] < num_classes, 'label = {}'.format(label)
    value = score
    if is_binary:
      value = 1
    res[y1:y2, x1:x2, category_map[label]] = value
  return res
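

# A minimal sketch of how generate_detection_image might be exercised with a
# hand-made detection dict (all values below are hypothetical, not from the
# dataset):
#
#   fake_detections = {
#       'detection_boxes': np.array([[0.25, 0.25, 0.75, 0.75]]),  # ymin, xmin, ymax, xmax
#       'detection_classes': np.array([3]),
#       'detection_scores': np.array([0.9]),
#   }
#   det_img = generate_detection_image(
#       fake_detections, image_size=64, category_map={3: 0}, num_classes=5)
#   # det_img has shape (64, 64, 5); channel 0 is 1 inside the box, 0 elsewhere.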


def _get_detection_path(root, detection_folder_name, world):
  return os.path.join(root, 'Meta', detection_folder_name, world + '.npy')


def _get_image_folder(root, world):
  return os.path.join(root, world, 'jpg_rgb')


def _get_json_path(root, world):
  return os.path.join(root, world, 'annotations.json')


def _get_image_path(root, world, image_id):
  return os.path.join(_get_image_folder(root, world), image_id + '.jpg')


def _get_image_list(path, worlds):
  """Builds a dictionary of image ids for all the worlds.

  Args:
    path: the path to the dataset on cns.
    worlds: list of the worlds.

  Returns:
    Dictionary where the keys are the world names and the values are the
    image ids of that world.
  """
  world_id_dict = {}
  for loc in worlds:
    files = [t[:-4] for t in tf.gfile.ListDir(_get_image_folder(path, loc))]
    world_id_dict[loc] = files
  return world_id_dict


def read_all_poses(dataset_root, world):
  """Reads all the poses for each world.

  Args:
    dataset_root: the path to the root of the dataset.
    world: string, name of the world.

  Returns:
    Dictionary of poses for all the images in each world. The keys are the
    image ids of each view and the values are tuples of (x, z, R, scale),
    where x and z are the first and third coordinates of the translation, R
    is the 3x3 rotation matrix, and scale is a float scalar by which x and z
    need to be multiplied in order to get the real world coordinates.

  Raises:
    ValueError: if the number of images does not match the number of poses
      read.
  """
  path = os.path.join(dataset_root, world, 'image_structs.mat')
  with tf.gfile.Open(path) as f:
    data = sio.loadmat(f)
  xyz = data['image_structs']['world_pos']
  image_names = data['image_structs']['image_name'][0]
  rot = data['image_structs']['R'][0]
  scale = data['scale'][0][0]
  n = xyz.shape[1]
  x = [xyz[0][i][0][0] for i in range(n)]
  z = [xyz[0][i][2][0] for i in range(n)]
  names = [name[0][:-4] for name in image_names]
  if len(names) != len(x):
    raise ValueError('number of image names is not equal to the number of '
                     'poses {} != {}'.format(len(names), len(x)))
  output = {}
  for i in range(n):
    if rot[i].shape[0] != 0:
      assert rot[i].shape[0] == 3
      assert rot[i].shape[1] == 3
      output[names[i]] = (x[i], z[i], rot[i], scale)
    else:
      output[names[i]] = (x[i], z[i], None, scale)
  return output
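

# Sketch of how the returned poses might be consumed (the dataset path and the
# image id below are hypothetical):
#
#   poses = read_all_poses('/path/to/AVD', 'Home_001_1')
#   x, z, rot, scale = poses['000110000010101']
#   # rot is a 3x3 rotation matrix, or None when the dataset does not provide
#   # one for that view; x and z must be scaled to get world coordinates.
#   world_x, world_z = x * scale, z * scale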


def read_cached_data(should_load_images, dataset_root, segmentation_file_name,
                     targets_file_name, output_size):
  """Reads all the necessary cached data.

  Args:
    should_load_images: whether to load the images or not.
    dataset_root: path to the root of the dataset.
    segmentation_file_name: The name of the file that contains semantic
      segmentation annotations.
    targets_file_name: The name of the file that contains targets annotated
      for each world.
    output_size: Size of the output images. This is used for pre-processing
      the loaded images.

  Returns:
    Dictionary of all the cached data.
  """
  load_start = time.time()
  result_data = {}

  annotated_target_path = os.path.join(dataset_root, 'Meta',
                                       targets_file_name + '.npy')
  logging.info('loading targets: %s', annotated_target_path)
  with tf.gfile.Open(annotated_target_path) as f:
    result_data['targets'] = np.load(f).item()

  depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy')
  logging.info('loading depth: %s', depth_image_path)
  with tf.gfile.Open(depth_image_path) as f:
    depth_data = np.load(f).item()

  logging.info('processing depth')
  for home_id in depth_data:
    images = depth_data[home_id]
    for image_id in images:
      depth = images[image_id]
      depth = cv2.resize(
          depth / _MAX_DEPTH_VALUE, (output_size, output_size),
          interpolation=cv2.INTER_NEAREST)
      depth_mask = (depth > 0).astype(np.float32)
      depth = np.dstack((depth, depth_mask))
      images[image_id] = depth
  result_data[task_env.ModalityTypes.DEPTH] = depth_data

  sseg_path = os.path.join(dataset_root, 'Meta',
                           segmentation_file_name + '.npy')
  logging.info('loading sseg: %s', sseg_path)
  with tf.gfile.Open(sseg_path) as f:
    sseg_data = np.load(f).item()

  logging.info('processing sseg')
  for home_id in sseg_data:
    images = sseg_data[home_id]
    for image_id in images:
      sseg = images[image_id]
      sseg = cv2.resize(
          sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
      images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32)
  result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data

  if should_load_images:
    image_path = os.path.join(dataset_root, 'Meta/imgs.npy')
    logging.info('loading imgs: %s', image_path)
    with tf.gfile.Open(image_path) as f:
      image_data = np.load(f).item()
    result_data[task_env.ModalityTypes.IMAGE] = image_data

  with tf.gfile.Open(os.path.join(dataset_root,
                                  'Meta/world_id_dict.npy')) as f:
    result_data['world_id_dict'] = np.load(f).item()
  logging.info('loading done in %f seconds', time.time() - load_start)
  return result_data
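

# Rough shape of the dictionary returned by read_cached_data. The keys follow
# the code above; the file names, world name, and image id are hypothetical:
#
#   cached = read_cached_data(True, '/path/to/AVD', 'sseg',
#                             'annotated_targets', output_size=64)
#   cached['targets']                                     # goal -> world -> image ids
#   cached['world_id_dict']['Home_001_1']                 # list of image ids
#   cached[task_env.ModalityTypes.DEPTH]['Home_001_1']['000110000010101']
#   # -> (64, 64, 2) float32 array: normalized depth plus a validity mask.
#   cached[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]  # same nesting, (64, 64, 1)
#   cached[task_env.ModalityTypes.IMAGE]                  # only if should_load_images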


def get_spec_dtype_map():
  return {gym.spaces.Box: np.float32}


class ActiveVisionDatasetEnv(task_env.TaskEnv):
  """Simulates the environment from ActiveVisionDataset."""

  cached_data = None

  def __init__(
      self,
      episode_length,
      modality_types,
      confidence_threshold,
      output_size,
      worlds,
      targets,
      compute_distance,
      should_draw_detections,
      dataset_root,
      labelmap_path,
      reward_collision,
      reward_goal_range,
      num_detection_classes,
      segmentation_file_name,
      detection_folder_name,
      actions,
      targets_file_name,
      eval_init_points_file_name=None,
      shaped_reward=False,
  ):
    """Instantiates the environment for the ActiveVision Dataset.

    Args:
      episode_length: the length of each episode.
      modality_types: a list of strings where each entry indicates the name of
        a modality to be loaded. Valid entries are "sseg", "det", "depth",
        "image", "distance", and "prev_action". "distance" should be used for
        computing metrics in tf agents.
      confidence_threshold: Consider only detections with a score above
        confidence_threshold as potential targets.
      output_size: Resolution of the output image.
      worlds: List of the names of the worlds.
      targets: List of the target names. Each entry is a string label of the
        target category (e.g. 'fridge', 'microwave', and so on).
      compute_distance: If True, outputs the distance of the view to the goal.
      should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
      dataset_root: the path to the root folder of the dataset.
      labelmap_path: path to the dictionary that converts label strings to
        indexes.
      reward_collision: the reward the agent gets after hitting an obstacle.
        It should be a non-positive number.
      reward_goal_range: the number of steps from the goal at which the agent
        is considered to have reached the goal. If the agent's distance is
        less than the specified goal range, the episode also finishes by
        setting done = True.
      num_detection_classes: number of classes that the detector outputs.
      segmentation_file_name: the name of the file that contains the semantic
        segmentation information. The file should be in the dataset_root/Meta/
        folder.
      detection_folder_name: Name of the folder that contains the detections
        for each world. The folder should be under the dataset_root/Meta/
        folder.
      actions: The list of the action names. Valid entries are listed in
        SUPPORTED_ACTIONS.
      targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
      eval_init_points_file_name: The name of the file that contains the
        initial points for evaluating the performance of the agent. If set to
        None, episodes start at random locations. Should only be set for
        evaluation.
      shaped_reward: Whether to add the delta goal distance to the reward each
        step.

    Raises:
      ValueError: If one of the targets is not available in the annotated
        targets or the modality names are not from the domain specified above.
      ValueError: If one of the actions is not in SUPPORTED_ACTIONS.
      ValueError: If reward_collision is a positive number.
      ValueError: If there is no action other than stop provided.
    """
    if reward_collision > 0:
      raise ValueError('"reward" for collision should be non-positive')

    if reward_goal_range < 0:
      logging.warning('environment does not terminate the episode if the '
                      'agent is too close to the goal')

    if not modality_types:
      raise ValueError('modality names cannot be empty')

    for name in modality_types:
      if name not in SUPPORTED_MODALITIES:
        raise ValueError('invalid modality type: {}'.format(name))

    actions_other_than_stop_found = False
    for a in actions:
      if a != 'stop':
        actions_other_than_stop_found = True
      if a not in SUPPORTED_ACTIONS:
        raise ValueError('invalid action {}'.format(a))
    if not actions_other_than_stop_found:
      raise ValueError('environment needs to have actions other than stop.')

    super(ActiveVisionDatasetEnv, self).__init__()

    self._episode_length = episode_length
    self._modality_types = set(modality_types)
    self._confidence_threshold = confidence_threshold
    self._output_size = output_size
    self._dataset_root = dataset_root
    self._worlds = worlds
    self._targets = targets
    self._all_graph = {}
    for world in self._worlds:
      with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f:
        file_content = f.read()
        file_content = file_content.replace('.jpg', '')
        io = StringIO(file_content)
        self._all_graph[world] = json.load(io)

    self._cur_world = ''
    self._cur_image_id = ''
    self._cur_graph = None  # Loaded by _update_graph.
    self._steps_taken = 0
    self._last_action_success = True
    self._category_index = _init_category_index(labelmap_path)
    self._category_map = dict(
        [(c, i) for i, c in enumerate(self._category_index)])
    self._detection_cache = {}
    if not ActiveVisionDatasetEnv.cached_data:
      ActiveVisionDatasetEnv.cached_data = read_cached_data(
          True, self._dataset_root, segmentation_file_name, targets_file_name,
          self._output_size)
    cached_data = ActiveVisionDatasetEnv.cached_data

    self._world_id_dict = cached_data['world_id_dict']
    self._depth_images = cached_data[task_env.ModalityTypes.DEPTH]
    self._semantic_segmentations = cached_data[
        task_env.ModalityTypes.SEMANTIC_SEGMENTATION]
    self._annotated_targets = cached_data['targets']
    self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE]
    self._graph_cache = {}
    self._compute_distance = compute_distance
    self._should_draw_detections = should_draw_detections
    self._reward_collision = reward_collision
    self._reward_goal_range = reward_goal_range
    self._num_detection_classes = num_detection_classes
    self._actions = actions
    self._detection_folder_name = detection_folder_name
    self._shaped_reward = shaped_reward

    self._eval_init_points = None
    if eval_init_points_file_name is not None:
      self._eval_init_index = 0
      init_points_path = os.path.join(self._dataset_root, 'Meta',
                                      eval_init_points_file_name + '.npy')
      with tf.gfile.Open(init_points_path) as points_file:
        data = np.load(points_file).item()
      self._eval_init_points = []
      for world in self._worlds:
        for goal in self._targets:
          if world in self._annotated_targets[goal]:
            for image_id in data[world]:
              self._eval_init_points.append((world, image_id[0], goal))
      logging.info('loaded %d eval init points', len(self._eval_init_points))

    self.action_space = gym.spaces.Discrete(len(self._actions))

    obs_shapes = {}
    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 1))
    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box(
          low=0,
          high=255,
          shape=(self._output_size, self._output_size,
                 self._num_detection_classes))
    if task_env.ModalityTypes.DEPTH in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box(
          low=0,
          high=_MAX_DEPTH_VALUE,
          shape=(self._output_size, self._output_size, 2))
    if task_env.ModalityTypes.IMAGE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 3))
    if task_env.ModalityTypes.GOAL in self._modality_types:
      obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._targets),))
    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._actions) + 1,))
    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box(
          low=0, high=255, shape=(1,))
    self.observation_space = gym.spaces.Dict(obs_shapes)

    self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32)

    # Loading all the poses.
    all_poses = {}
    for world in self._worlds:
      all_poses[world] = read_all_poses(self._dataset_root, world)
    self._cached_poses = all_poses
    self._vertex_to_pose = {}
    self._pose_to_vertex = {}
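
  # A hedged construction sketch (every path and value below is hypothetical;
  # a real instantiation needs a local copy of the ActiveVision Dataset and
  # the cached .npy files under <dataset_root>/Meta/):
  #
  #   env = ActiveVisionDatasetEnv(
  #       episode_length=100,
  #       modality_types=[task_env.ModalityTypes.IMAGE,
  #                       task_env.ModalityTypes.GOAL,
  #                       task_env.ModalityTypes.PREV_ACTION],
  #       confidence_threshold=0.5,
  #       output_size=64,
  #       worlds=['Home_001_1'],
  #       targets=['fridge'],
  #       compute_distance=False,
  #       should_draw_detections=False,
  #       dataset_root='/path/to/AVD',
  #       labelmap_path='/path/to/label_map.pbtxt',
  #       reward_collision=-0.1,
  #       reward_goal_range=2,
  #       num_detection_classes=90,
  #       segmentation_file_name='sseg',
  #       detection_folder_name='Detections',
  #       actions=list(SUPPORTED_ACTIONS),
  #       targets_file_name='annotated_targets')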

  def actions(self):
    """Returns the list of actions for the env."""
    return self._actions

  def _next_image(self, image_id, action):
    """Given the action, returns the name of the image the agent ends up in.

    Args:
      image_id: The image id of the current view.
      action: valid actions are ['right', 'rotate_cw', 'rotate_ccw',
        'forward', 'left']. Each rotation is 30 degrees.

    Returns:
      The image name for the next location of the agent. If the action results
      in a collision or it is not possible for the agent to execute that
      action, returns an empty string.
    """
    assert action in self._actions, 'invalid action : {}'.format(action)
    assert self._cur_world in self._all_graph, 'invalid world {}'.format(
        self._cur_world)
    assert image_id in self._all_graph[
        self._cur_world], 'image_id {} is not in {}'.format(
            image_id, self._cur_world)
    return self._all_graph[self._cur_world][image_id][action]

  def _largest_detection_for_image(self, image_id, detections_dict):
    """Stores the area of the largest target detection for the given view.

    Args:
      image_id: Image id of the view.
      detections_dict: Detections for the view.
    """
    for cls, box, score in zip(detections_dict['detection_classes'],
                               detections_dict['detection_boxes'],
                               detections_dict['detection_scores']):
      if cls not in self._targets:
        continue
      if score < self._confidence_threshold:
        continue
      ymin, xmin, ymax, xmax = box
      area = (ymax - ymin) * (xmax - xmin)
      if abs(area) < 1e-5:
        continue
      if image_id not in self._detection_area:
        self._detection_area[image_id] = area
      else:
        self._detection_area[image_id] = max(self._detection_area[image_id],
                                             area)

  def _compute_goal_indexes(self):
    """Computes the goal indexes for the environment.

    Returns:
      The indexes of the goals that are closest to the target categories. A
      vertex is a goal vertex if the desired objects are detected in its image
      and the target categories are not seen by moving forward from that
      vertex.
    """
    for image_id in self._world_id_dict[self._cur_world]:
      detections_dict = self._detection_table[image_id]
      self._largest_detection_for_image(image_id, detections_dict)
    goal_indexes = []
    for image_id in self._world_id_dict[self._cur_world]:
      if image_id not in self._detection_area:
        continue
      # Skip views whose largest detection box is not large enough.
      if self._detection_area[image_id] < 0.01:
        continue
      ok = True
      next_image_id = self._next_image(image_id, 'forward')
      if next_image_id:
        if next_image_id in self._detection_area:
          ok = False
      if ok:
        goal_indexes.append(self._cur_graph.id_to_index[image_id])
    return goal_indexes

  def to_image_id(self, vid):
    """Converts vertex id to the image id.

    Args:
      vid: vertex id of the view.

    Returns:
      image id of the input vertex id.
    """
    return self._cur_graph.index_to_id[vid]

  def to_vertex(self, image_id):
    return self._cur_graph.id_to_index[image_id]

  def observation(self, view_pose):
    """Returns the observation at the given view pose.

    Args:
      view_pose: pose of the view of interest.

    Returns:
      Observation at the given view point.

    Raises:
      ValueError: if the given view pose is not similar to any of the poses in
        the current world.
    """
    vertex = self.pose_to_vertex(view_pose)
    if vertex is None:
      raise ValueError('The given pose is not close enough to any of the '
                       'poses in the environment.')
    image_id = self._cur_graph.index_to_id[vertex]
    output = collections.OrderedDict()
    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      output[task_env.ModalityTypes.
             SEMANTIC_SEGMENTATION] = self._semantic_segmentations[
                 self._cur_world][image_id]
    detection = None
    need_det = (
        task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or
        (task_env.ModalityTypes.IMAGE in self._modality_types and
         self._should_draw_detections))
    if need_det:
      detection = self._detection_table[image_id]
      detection_image = generate_detection_image(
          detection,
          self._output_size,
          self._category_map,
          num_classes=self._num_detection_classes)
    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image
    if task_env.ModalityTypes.DEPTH in self._modality_types:
      output[task_env.ModalityTypes.DEPTH] = self._depth_images[
          self._cur_world][image_id]
    if task_env.ModalityTypes.IMAGE in self._modality_types:
      output_img = self._cached_imgs[self._cur_world][image_id]
      if self._should_draw_detections:
        output_img = output_img.copy()
        _draw_detections(output_img, detection, self._category_index)
      output[task_env.ModalityTypes.IMAGE] = output_img
    if task_env.ModalityTypes.GOAL in self._modality_types:
      goal = np.zeros((len(self._targets),), dtype=np.float32)
      goal[self._targets.index(self._cur_goal)] = 1.
      output[task_env.ModalityTypes.GOAL] = goal
    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action
    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      output[task_env.ModalityTypes.DISTANCE] = np.asarray(
          [self.gt_value(self._cur_goal, vertex)], dtype=np.float32)
    return output

  def _step_no_reward(self, action):
    """Performs a step in the environment with the given action.

    Args:
      action: Action that is used to step in the environment. Action can be a
        string or an integer. If the type is integer then the ith element of
        the self._actions list is used; otherwise, the string value itself is
        used as the action.

    Returns:
      observation, done, info
      observation: dictionary that contains all the observations specified in
        modality_types.
        observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the
          detection of the current view.
        observation[task_env.ModalityTypes.IMAGE]: contains the image of the
          current view. Note that if using the images for training,
          should_load_images should be set to false.
        observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains
          the semantic segmentation of the current view.
        observation[task_env.ModalityTypes.DEPTH]: If selected, returns the
          depth map for the current view.
        observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
          a numpy array of shape (action_size + 1,). The first action_size
          elements indicate the action and the last element indicates whether
          the previous action was successful or not.
      done: True if episode_length steps have been taken, the stop action is
        executed, or the agent gets within reward_goal_range of the goal;
        False otherwise.
      info: Dictionary with the success of the last action.

    Raises:
      ValueError: for invalid actions.
    """
    # Primarily used for gym interface.
    if not isinstance(action, str):
      if not self.action_space.contains(action):
        raise ValueError('Not a valid action: {}'.format(action))
      action = self._actions[action]

    if action not in self._actions:
      raise ValueError('Not a valid action: {}'.format(action))
    action_index = self._actions.index(action)

    if action == 'stop':
      next_image_id = self._cur_image_id
      done = True
      success = True
    else:
      next_image_id = self._next_image(self._cur_image_id, action)
      self._steps_taken += 1
      done = False
      success = True
      if not next_image_id:
        success = False
      else:
        self._cur_image_id = next_image_id
      if self._steps_taken >= self._episode_length:
        done = True

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))

    # Concatenation of the one-hot previous action and a binary flag for the
    # success of the previous action.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[action_index] = 1.
    self._prev_action[-1] = float(success)

    distance_to_goal = self.gt_value(self._cur_goal, cur_vertex)
    if success:
      if distance_to_goal <= self._reward_goal_range:
        done = True

    return observation, done, {'success': success}
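
  # Illustration of the prev_action encoding above: with
  # actions == SUPPORTED_ACTIONS, a successful 'forward' step sets
  # self._prev_action to
  #
  #   [0., 0., 0., 1., 0., 0., 0., 1.]
  #
  # i.e. a one-hot over the 7 actions followed by a trailing 1.0 because the
  # move succeeded; the trailing element is 0.0 after a collision.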

  # graph is accessed as a property (e.g. self.graph[v]) in action() below.
  @property
  def graph(self):
    return self._cur_graph.graph

  def state(self):
    return self.vertex_to_pose(self.to_vertex(self._cur_image_id))

  def gt_value(self, goal, v):
    """Computes the distance to the goal from vertex v.

    Args:
      goal: name of the goal.
      v: vertex id.

    Returns:
      Minimum number of steps to the given goal.
    """
    assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal)
    assert v in self._cur_graph.distance_to_goal[goal]
    res = self._cur_graph.distance_to_goal[goal][v]
    return res

  def _update_graph(self):
    """Creates the graph for each environment and updates _cur_graph."""
    if self._cur_world not in self._graph_cache:
      graph = nx.DiGraph()
      id_to_index = {}
      index_to_id = {}
      image_list = self._world_id_dict[self._cur_world]
      for i, image_id in enumerate(image_list):
        id_to_index[image_id] = i
        index_to_id[i] = image_id
        graph.add_node(i)

      for image_id in image_list:
        for action in self._actions:
          if action == 'stop':
            continue
          next_image = self._all_graph[self._cur_world][image_id][action]
          if next_image:
            graph.add_edge(
                id_to_index[image_id], id_to_index[next_image], action=action)

      target_indexes = {}
      number_of_nodes_without_targets = graph.number_of_nodes()
      distance_to_goal = {}
      for goal in self._targets:
        if self._cur_world not in self._annotated_targets[goal]:
          continue
        goal_indexes = [
            id_to_index[i]
            for i in self._annotated_targets[goal][self._cur_world]
            if i
        ]
        # Add a super-source node per goal and connect it to every annotated
        # goal view, so the distance to the closest instance of the goal can
        # be computed with a single shortest-path query.
        super_source_index = graph.number_of_nodes()
        target_indexes[goal] = super_source_index
        graph.add_node(super_source_index)
        index_to_id[super_source_index] = goal
        id_to_index[goal] = super_source_index
        for v in goal_indexes:
          graph.add_edge(v, super_source_index, action='stop')
          graph.add_edge(super_source_index, v, action='stop')
        distance_to_goal[goal] = {}
        for v in range(number_of_nodes_without_targets):
          # The returned path contains both v and the super-source, so the
          # number of steps to the closest goal view is the path length
          # minus 2.
          distance_to_goal[goal][v] = len(
              nx.shortest_path(graph, v, super_source_index)) - 2
      self._graph_cache[self._cur_world] = _Graph(
          graph, id_to_index, index_to_id, target_indexes, distance_to_goal)
    self._cur_graph = self._graph_cache[self._cur_world]

  def reset_for_eval(self, new_world, new_goal, new_image_id):
    """Resets to the given world, goal, and image_id."""
    return self._reset_env(
        new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)

  def get_init_config(self, path):
    """Exposes the initial state of the agent for the given path.

    Args:
      path: sequence of the vertices that the agent moves along.

    Returns:
      image_id of the first view, the world, and the goal.
    """
    return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal

  def _reset_env(
      self,
      new_world=None,
      new_goal=None,
      new_image_id=None,
  ):
    """Resets the agent in a random world and random image id.

    Args:
      new_world: If not None, sets the new world to new_world.
      new_goal: If not None, sets the new goal to new_goal.
      new_image_id: If not None, sets the first image id to new_image_id.

    Returns:
      observation: dictionary of the observations. The content of the
        observation is similar to that of the step function.

    Raises:
      ValueError: if it can't find a world with an annotated goal.
    """
    self._steps_taken = 0
    # The first prev_action is a special all-zero vector with success = 1.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[len(self._actions)] = 1.

    if self._eval_init_points is not None:
      if self._eval_init_index >= len(self._eval_init_points):
        self._eval_init_index = 0
      a = self._eval_init_points[self._eval_init_index]
      self._cur_world, self._cur_image_id, self._cur_goal = a
      self._eval_init_index += 1
    elif not new_world:
      attempts = 100
      found = False
      while attempts >= 0:
        attempts -= 1
        self._cur_goal = np.random.choice(self._targets)
        available_worlds = list(
            set(self._annotated_targets[self._cur_goal].keys()).intersection(
                set(self._worlds)))
        if available_worlds:
          found = True
          break
      if not found:
        raise ValueError('could not find a world that has a target annotated')
      self._cur_world = np.random.choice(available_worlds)
    else:
      self._cur_world = new_world
      self._cur_goal = new_goal
      if new_world not in self._annotated_targets[new_goal]:
        return None

    self._cur_goal_index = self._targets.index(self._cur_goal)
    if new_image_id:
      self._cur_image_id = new_image_id
    else:
      self._cur_image_id = np.random.choice(
          self._world_id_dict[self._cur_world])

    if self._cur_world not in self._detection_cache:
      with tf.gfile.Open(
          _get_detection_path(self._dataset_root, self._detection_folder_name,
                              self._cur_world)) as f:
        # Each file contains a dictionary with image ids as keys and detection
        # dicts as values.
        self._detection_cache[self._cur_world] = np.load(f).item()
    self._detection_table = self._detection_cache[self._cur_world]
    self._detection_area = {}
    self._update_graph()

    if self._cur_world not in self._vertex_to_pose:
      # Add fake poses for the super nodes of the target categories.
      self._vertex_to_pose[self._cur_world] = {
          index: (-index,) for index in self._cur_graph.target_indexes.values()
      }
      # Calling vertex_to_pose for each vertex fills out the dictionaries that
      # contain pose related data.
      for image_id in self._world_id_dict[self._cur_world]:
        self.vertex_to_pose(self.to_vertex(image_id))
      # Fill out pose_to_vertex from vertex_to_pose.
      self._pose_to_vertex[self._cur_world] = {
          tuple(v): k
          for k, v in self._vertex_to_pose[self._cur_world].iteritems()
      }

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))
    return observation

  def cur_vertex(self):
    return self._cur_graph.id_to_index[self._cur_image_id]

  def cur_image_id(self):
    return self._cur_image_id

  def path_to_goal(self, image_id=None):
    """Returns the path from image_id to self._cur_goal.

    Args:
      image_id: If set to None, computes the path from the current view.
        Otherwise, computes the path from the given image_id.

    Returns:
      The path to the goal.

    Raises:
      Exception: if there's no path from the view to the goal.
    """
    if image_id is None:
      image_id = self._cur_image_id
    super_source = self._cur_graph.target_indexes[self._cur_goal]
    try:
      path = nx.shortest_path(self._cur_graph.graph,
                              self._cur_graph.id_to_index[image_id],
                              super_source)
    except:
      print 'path not found, world, image_id = ', self._cur_world, self._cur_image_id
      raise
    return path[:-1]

  def targets(self):
    return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])]

  def vertex_to_pose(self, v):
    """Returns the pose of the view for a given vertex.

    Args:
      v: integer, vertex index.

    Returns:
      (x, z, dir_x, dir_z) where x and z are the translation and (dir_x,
      dir_z) is a vector giving the direction of the view.
    """
    if v in self._vertex_to_pose[self._cur_world]:
      return np.copy(self._vertex_to_pose[self._cur_world][v])

    x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id(
        v)]
    if rot is None:  # if rotation is not provided for the given vertex.
      self._vertex_to_pose[self._cur_world][v] = np.asarray(
          [x * scale, z * scale, v])
      return np.copy(self._vertex_to_pose[self._cur_world][v])

    # Multiply the rotation matrix by [0, 0, 1] to get a vector of length 1 in
    # the direction of the ray.
    direction = np.zeros((3, 1), dtype=np.float32)
    direction[2][0] = 1
    direction = np.matmul(np.transpose(rot), direction)
    direction = [direction[0][0], direction[2][0]]
    self._vertex_to_pose[self._cur_world][v] = np.asarray(
        [x * scale, z * scale, direction[0], direction[1]])
    return np.copy(self._vertex_to_pose[self._cur_world][v])

  def pose_to_vertex(self, pose):
    """Returns the vertex id for the given pose."""
    if tuple(pose) not in self._pose_to_vertex[self._cur_world]:
      raise ValueError(
          'The given pose is not present in the dictionary: {}'.format(
              tuple(pose)))
    return self._pose_to_vertex[self._cur_world][tuple(pose)]

  def check_scene_graph(self, world, goal):
    """Checks the connectivity of the scene graph.

    Goes over all the views and computes the shortest path to the goal. If it
    crashes it means that the graph is not connected. Otherwise, the env graph
    is fine.

    Args:
      world: the string name of the world.
      goal: the string label for the goal.

    Returns:
      Nothing.
    """
    obs = self._reset_env(new_world=world, new_goal=goal)
    if not obs:
      print '{} is not available in {}'.format(goal, world)
      return True
    for image_id in self._world_id_dict[self._cur_world]:
      print 'check image_id = {}'.format(image_id)
      self._cur_image_id = image_id
      path = self.path_to_goal()
      actions = []
      for i in range(len(path) - 2):
        actions.append(self.action(path[i], path[i + 1]))
      actions.append('stop')

  def goal_one_hot(self):
    res = np.zeros((len(self._targets),), dtype=np.float32)
    res[self._cur_goal_index] = 1.
    return res

  def goal_index(self):
    return self._cur_goal_index

  def goal_string(self):
    return self._cur_goal

  def worlds(self):
    return self._worlds

  def possible_targets(self):
    return self._targets

  def action(self, from_pose, to_pose):
    """Returns the action that takes the source vertex to the destination.

    Args:
      from_pose: pose of the source.
      to_pose: pose of the destination.

    Returns:
      The index of the action.

    Raises:
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with one action.
    """
    from_index = self.pose_to_vertex(from_pose)
    to_index = self.pose_to_vertex(to_pose)
    if to_index not in self.graph[from_index]:
      from_image_id = self.to_image_id(from_index)
      to_image_id = self.to_image_id(to_index)
      raise ValueError('{},{} is not connected to {},{}'.format(
          from_index, from_image_id, to_index, to_image_id))
    return self._actions.index(self.graph[from_index][to_index]['action'])

  def random_step_sequence(self, min_len=None, max_len=None):
    """Generates a random step sequence from a random starting point.

    Args:
      min_len: integer, minimum length of a step sequence. Not yet
        implemented.
      max_len: integer, the maximum number of observations and the maximum
        path length.

    Returns:
      Tuple of (path, actions, states, step_outputs).
      path: a random path from a random starting point and random environment.
      actions: actions of the returned path.
      states: viewpoints of all the states in between.
      step_outputs: list of step() return tuples.

    Raises:
      ValueError: if max_len is None or smaller than 1, or if min_len is
        different from None.
    """
    if max_len is None:
      raise ValueError('max_len can not be set as None')
    if max_len < 1:
      raise ValueError('max_len must be greater than or equal to 1.')
    if min_len is not None:
      raise ValueError('min_len is not yet implemented.')

    path = []
    actions = []
    states = []
    step_outputs = []
    obs = self.reset()
    last_obs_tuple = [obs, 0, False, {}]
    for _ in xrange(max_len):
      action = np.random.choice(self._actions)
      # We don't want to sample the stop action because stop does not add new
      # information.
      while action == 'stop':
        action = np.random.choice(self._actions)
      path.append(self.to_vertex(self._cur_image_id))
      onehot = np.zeros((len(self._actions),), dtype=np.float32)
      onehot[self._actions.index(action)] = 1.
      actions.append(onehot)
      states.append(self.vertex_to_pose(path[-1]))
      step_outputs.append(copy.deepcopy(last_obs_tuple))
      last_obs_tuple = self.step(action)
    return path, actions, states, step_outputs
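

# Sketch of consuming random_step_sequence's return value (assumes an `env`
# constructed as in the hypothetical sketch near __init__ above):
#
#   path, actions, states, step_outputs = env.random_step_sequence(max_len=20)
#   for vertex, one_hot_action, pose in zip(path, actions, states):
#       image_id = env.to_image_id(vertex)
#       # one_hot_action has shape (len(env.actions()),); pose comes from
#       # vertex_to_pose and is (x, z, dir_x, dir_z) when rotation is known.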