# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gym environment for the ActiveVision Dataset.
The dataset is captured with a robot moving around and taking pictures in
multiple directions. The actions are moving in four directions and rotating
clockwise or counterclockwise. The observations are the output of vision
pipelines such as object detectors. The goal is to find objects of interest
in each environment. For more details, refer:
http://cs.unc.edu/~ammirato/active_vision_dataset_website/.
"""
import tensorflow as tf
import collections
import copy
import json
import os
from io import StringIO
import time
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
from absl import logging
import gin
import cv2
import label_map_util
import visualization_utils as vis_util
from envs import task_env
register(
id='active-vision-env-v0',
entry_point=
'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv', # pylint: disable=line-too-long
)
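# Registering the id above also makes the environment constructible through
# the standard gym API, e.g. (illustrative; the constructor arguments listed
# in ActiveVisionDatasetEnv.__init__ still have to be supplied, typically via
# gin):
#   env = gym.make('active-vision-env-v0')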
_MAX_DEPTH_VALUE = 12102
SUPPORTED_ACTIONS = [
'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
SUPPORTED_MODALITIES = [
task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
task_env.ModalityTypes.DEPTH,
task_env.ModalityTypes.OBJECT_DETECTION,
task_env.ModalityTypes.IMAGE,
task_env.ModalityTypes.GOAL,
task_env.ModalityTypes.PREV_ACTION,
task_env.ModalityTypes.DISTANCE,
]
# Data structure for storing the information related to the graph of the world.
_Graph = collections.namedtuple('_Graph', [
'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal'
])
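# _Graph fields (filled in by ActiveVisionDatasetEnv._update_graph):
#   graph: networkx DiGraph whose nodes are vertex indexes and whose edges are
#     labeled with the action that moves the agent between the two views.
#   id_to_index / index_to_id: mappings between image ids and vertex indexes.
#   target_indexes: maps each goal category to the index of its super node.
#   distance_to_goal: distance_to_goal[goal][v] is the number of steps from
#     vertex v to the closest view of that goal.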
def _init_category_index(label_map_path):
"""Creates category index from class indexes to name of the classes.
Args:
label_map_path: path to the mapping.
Returns:
A map for mapping int keys to string categories.
"""
label_map = label_map_util.load_labelmap(label_map_path)
  num_classes = np.max([x.id for x in label_map.item])
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes=num_classes, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
return category_index
def _draw_detections(image_np, detections, category_index):
"""Draws detections on to the image.
Args:
image_np: Image in the form of uint8 numpy array.
detections: a dictionary that contains the detection outputs.
category_index: contains the mapping between indexes and the category names.
Returns:
    Does not return anything but draws the boxes on the image in place.
"""
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
detections['detection_boxes'],
detections['detection_classes'],
detections['detection_scores'],
category_index,
use_normalized_coordinates=True,
max_boxes_to_draw=1000,
min_score_thresh=.0,
agnostic_mode=False)
def generate_detection_image(detections,
image_size,
category_map,
num_classes,
is_binary=True):
"""Generates one_hot vector of the image using the detection boxes.
Args:
detections: 2D object detections from the image. It's a dictionary that
contains detection_boxes, detection_classes, and detection_scores with
dimensions of nx4, nx1, nx1 where n is the number of detections.
image_size: The resolution of the output image.
category_map: dictionary that maps label names to index.
num_classes: Number of classes.
is_binary: If true, it sets the corresponding channels to 0 and 1.
Otherwise, sets the score in the corresponding channel.
Returns:
Returns image_size x image_size x num_classes image for the detection boxes.
"""
res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
boxes = detections['detection_boxes']
labels = detections['detection_classes']
scores = detections['detection_scores']
for box, label, score in zip(boxes, labels, scores):
transformed_boxes = [int(round(t)) for t in box * image_size]
y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with an area of
    # zero do not correspond to any real detection box, so we skip boxes with
    # area 0.
if (y2 - y1) * (x2 - x1) == 0:
continue
assert category_map[label] < num_classes, 'label = {}'.format(label)
value = score
if is_binary:
value = 1
res[y1:y2, x1:x2, category_map[label]] = value
return res
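# As an example with illustrative values: a single detection with box
# [0.0, 0.0, 0.5, 0.5], class label 3 and score 0.9 on image_size=10 sets
# res[0:5, 0:5, category_map[3]] to 1 (is_binary=True) or to 0.9 otherwise.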
def _get_detection_path(root, detection_folder_name, world):
return os.path.join(root, 'Meta', detection_folder_name, world + '.npy')
def _get_image_folder(root, world):
return os.path.join(root, world, 'jpg_rgb')
def _get_json_path(root, world):
return os.path.join(root, world, 'annotations.json')
def _get_image_path(root, world, image_id):
return os.path.join(_get_image_folder(root, world), image_id + '.jpg')
def _get_image_list(path, worlds):
"""Builds a dictionary for all the worlds.
Args:
path: the path to the dataset on cns.
worlds: list of the worlds.
Returns:
    dictionary where the keys are the world names and the values
    are the image_ids of that world.
"""
world_id_dict = {}
for loc in worlds:
    files = [
        t[:-4] for t in tf.gfile.ListDirectory(_get_image_folder(path, loc))
    ]
world_id_dict[loc] = files
return world_id_dict
def read_all_poses(dataset_root, world):
"""Reads all the poses for each world.
Args:
dataset_root: the path to the root of the dataset.
world: string, name of the world.
Returns:
Dictionary of poses for all the images in each world. The key is the image
id of each view and the values are tuple of (x, z, R, scale). Where x and z
are the first and third coordinate of translation. R is the 3x3 rotation
    matrix and scale is a float scalar by which x and z need to be multiplied
    in order to get the real world coordinates.
  Raises:
    ValueError: if the number of images does not match the number of poses
      read.
"""
path = os.path.join(dataset_root, world, 'image_structs.mat')
with tf.gfile.Open(path) as f:
data = sio.loadmat(f)
xyz = data['image_structs']['world_pos']
image_names = data['image_structs']['image_name'][0]
rot = data['image_structs']['R'][0]
scale = data['scale'][0][0]
n = xyz.shape[1]
x = [xyz[0][i][0][0] for i in range(n)]
z = [xyz[0][i][2][0] for i in range(n)]
names = [name[0][:-4] for name in image_names]
if len(names) != len(x):
    raise ValueError('number of image names is not equal to the number of '
                     'poses {} != {}'.format(len(names), len(x)))
output = {}
for i in range(n):
if rot[i].shape[0] != 0:
assert rot[i].shape[0] == 3
assert rot[i].shape[1] == 3
output[names[i]] = (x[i], z[i], rot[i], scale)
else:
output[names[i]] = (x[i], z[i], None, scale)
return output
def read_cached_data(should_load_images, dataset_root, segmentation_file_name,
targets_file_name, output_size):
"""Reads all the necessary cached data.
Args:
should_load_images: whether to load the images or not.
dataset_root: path to the root of the dataset.
segmentation_file_name: The name of the file that contains semantic
segmentation annotations.
    targets_file_name: The name of the file that contains targets annotated
      for each world.
output_size: Size of the output images. This is used for pre-processing the
loaded images.
Returns:
Dictionary of all the cached data.
"""
load_start = time.time()
result_data = {}
annotated_target_path = os.path.join(dataset_root, 'Meta',
targets_file_name + '.npy')
logging.info('loading targets: %s', annotated_target_path)
with tf.gfile.Open(annotated_target_path) as f:
result_data['targets'] = np.load(f).item()
depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy')
logging.info('loading depth: %s', depth_image_path)
with tf.gfile.Open(depth_image_path) as f:
depth_data = np.load(f).item()
logging.info('processing depth')
for home_id in depth_data:
images = depth_data[home_id]
for image_id in images:
depth = images[image_id]
depth = cv2.resize(
depth / _MAX_DEPTH_VALUE, (output_size, output_size),
interpolation=cv2.INTER_NEAREST)
depth_mask = (depth > 0).astype(np.float32)
depth = np.dstack((depth, depth_mask))
images[image_id] = depth
result_data[task_env.ModalityTypes.DEPTH] = depth_data
sseg_path = os.path.join(dataset_root, 'Meta',
segmentation_file_name + '.npy')
logging.info('loading sseg: %s', sseg_path)
with tf.gfile.Open(sseg_path) as f:
sseg_data = np.load(f).item()
logging.info('processing sseg')
for home_id in sseg_data:
images = sseg_data[home_id]
for image_id in images:
sseg = images[image_id]
sseg = cv2.resize(
sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32)
result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data
if should_load_images:
image_path = os.path.join(dataset_root, 'Meta/imgs.npy')
logging.info('loading imgs: %s', image_path)
with tf.gfile.Open(image_path) as f:
image_data = np.load(f).item()
result_data[task_env.ModalityTypes.IMAGE] = image_data
with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f:
result_data['world_id_dict'] = np.load(f).item()
  logging.info('loading done in %f seconds', time.time() - load_start)
return result_data
@gin.configurable
def get_spec_dtype_map():
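  """Returns the numpy dtype to use for each type of gym space."""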
return {gym.spaces.Box: np.float32}
@gin.configurable
class ActiveVisionDatasetEnv(task_env.TaskEnv):
"""Simulates the environment from ActiveVisionDataset."""
cached_data = None
def __init__(
self,
episode_length,
modality_types,
confidence_threshold,
output_size,
worlds,
targets,
compute_distance,
should_draw_detections,
dataset_root,
labelmap_path,
reward_collision,
reward_goal_range,
num_detection_classes,
segmentation_file_name,
detection_folder_name,
actions,
targets_file_name,
eval_init_points_file_name=None,
shaped_reward=False,
):
"""Instantiates the environment for ActiveVision Dataset.
Args:
episode_length: the length of each episode.
      modality_types: a list of strings where each entry indicates the name
of the modalities to be loaded. Valid entries are "sseg", "det",
"depth", "image", "distance", and "prev_action". "distance" should be
used for computing metrics in tf agents.
confidence_threshold: Consider detections more than confidence_threshold
for potential targets.
output_size: Resolution of the output image.
worlds: List of the name of the worlds.
targets: List of the target names. Each entry is a string label of the
        target category (e.g. 'fridge', 'microwave', and so on).
compute_distance: If True, outputs the distance of the view to the goal.
should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
dataset_root: the path to the root folder of the dataset.
labelmap_path: path to the dictionary that converts label strings to
indexes.
reward_collision: the reward the agents get after hitting an obstacle.
It should be a non-positive number.
      reward_goal_range: the number of steps from the goal, such that the agent
        is considered to have reached the goal. If the agent's distance is less
        than the specified goal range, the episode also finishes by setting
        done = True.
num_detection_classes: number of classes that detector outputs.
segmentation_file_name: the name of the file that contains the semantic
information. The file should be in the dataset_root/Meta/ folder.
detection_folder_name: Name of the folder that contains the detections
for each world. The folder should be under dataset_root/Meta/ folder.
actions: The list of the action names. Valid entries are listed in
SUPPORTED_ACTIONS.
targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
eval_init_points_file_name: The name of the file that contains the initial
points for evaluating the performance of the agent. If set to None,
episodes start at random locations. Should be only set for evaluation.
shaped_reward: Whether to add delta goal distance to the reward each step.
Raises:
      ValueError: If one of the targets is not available in the annotated
        targets or the modality names are not from the domain specified above.
ValueError: If one of the actions is not in SUPPORTED_ACTIONS.
ValueError: If the reward_collision is a positive number.
ValueError: If there is no action other than stop provided.
"""
if reward_collision > 0:
raise ValueError('"reward" for collision should be non positive')
if reward_goal_range < 0:
      logging.warning('environment does not terminate the episode when the '
                      'agent gets close to the goal')
if not modality_types:
raise ValueError('modality names can not be empty')
for name in modality_types:
if name not in SUPPORTED_MODALITIES:
raise ValueError('invalid modality type: {}'.format(name))
actions_other_than_stop_found = False
for a in actions:
if a != 'stop':
actions_other_than_stop_found = True
if a not in SUPPORTED_ACTIONS:
        raise ValueError('invalid action {}'.format(a))
if not actions_other_than_stop_found:
raise ValueError('environment needs to have actions other than stop.')
super(ActiveVisionDatasetEnv, self).__init__()
self._episode_length = episode_length
self._modality_types = set(modality_types)
self._confidence_threshold = confidence_threshold
self._output_size = output_size
self._dataset_root = dataset_root
self._worlds = worlds
self._targets = targets
self._all_graph = {}
for world in self._worlds:
with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f:
file_content = f.read()
file_content = file_content.replace('.jpg', '')
io = StringIO(file_content)
self._all_graph[world] = json.load(io)
self._cur_world = ''
self._cur_image_id = ''
self._cur_graph = None # Loaded by _update_graph
self._steps_taken = 0
self._last_action_success = True
self._category_index = _init_category_index(labelmap_path)
self._category_map = dict(
[(c, i) for i, c in enumerate(self._category_index)])
self._detection_cache = {}
if not ActiveVisionDatasetEnv.cached_data:
ActiveVisionDatasetEnv.cached_data = read_cached_data(
True, self._dataset_root, segmentation_file_name, targets_file_name,
self._output_size)
cached_data = ActiveVisionDatasetEnv.cached_data
self._world_id_dict = cached_data['world_id_dict']
self._depth_images = cached_data[task_env.ModalityTypes.DEPTH]
self._semantic_segmentations = cached_data[
task_env.ModalityTypes.SEMANTIC_SEGMENTATION]
self._annotated_targets = cached_data['targets']
self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE]
self._graph_cache = {}
self._compute_distance = compute_distance
self._should_draw_detections = should_draw_detections
self._reward_collision = reward_collision
self._reward_goal_range = reward_goal_range
self._num_detection_classes = num_detection_classes
self._actions = actions
self._detection_folder_name = detection_folder_name
self._shaped_reward = shaped_reward
self._eval_init_points = None
if eval_init_points_file_name is not None:
self._eval_init_index = 0
init_points_path = os.path.join(self._dataset_root, 'Meta',
eval_init_points_file_name + '.npy')
with tf.gfile.Open(init_points_path) as points_file:
data = np.load(points_file).item()
self._eval_init_points = []
for world in self._worlds:
for goal in self._targets:
if world in self._annotated_targets[goal]:
for image_id in data[world]:
self._eval_init_points.append((world, image_id[0], goal))
logging.info('loaded %d eval init points', len(self._eval_init_points))
self.action_space = gym.spaces.Discrete(len(self._actions))
obs_shapes = {}
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box(
low=0, high=255, shape=(self._output_size, self._output_size, 1))
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box(
low=0,
high=255,
shape=(self._output_size, self._output_size,
self._num_detection_classes))
if task_env.ModalityTypes.DEPTH in self._modality_types:
obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box(
low=0,
high=_MAX_DEPTH_VALUE,
shape=(self._output_size, self._output_size, 2))
if task_env.ModalityTypes.IMAGE in self._modality_types:
obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box(
low=0, high=255, shape=(self._output_size, self._output_size, 3))
if task_env.ModalityTypes.GOAL in self._modality_types:
obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box(
low=0, high=1., shape=(len(self._targets),))
if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box(
low=0, high=1., shape=(len(self._actions) + 1,))
if task_env.ModalityTypes.DISTANCE in self._modality_types:
obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box(
low=0, high=255, shape=(1,))
self.observation_space = gym.spaces.Dict(obs_shapes)
self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32)
# Loading all the poses.
all_poses = {}
for world in self._worlds:
all_poses[world] = read_all_poses(self._dataset_root, world)
self._cached_poses = all_poses
self._vertex_to_pose = {}
self._pose_to_vertex = {}
@property
def actions(self):
"""Returns list of actions for the env."""
return self._actions
def _next_image(self, image_id, action):
"""Given the action, returns the name of the image that agent ends up in.
Args:
image_id: The image id of the current view.
action: valid actions are ['right', 'rotate_cw', 'rotate_ccw',
'forward', 'left']. Each rotation is 30 degrees.
Returns:
The image name for the next location of the agent. If the action results
in collision or it is not possible for the agent to execute that action,
returns empty string.
"""
assert action in self._actions, 'invalid action : {}'.format(action)
assert self._cur_world in self._all_graph, 'invalid world {}'.format(
self._cur_world)
assert image_id in self._all_graph[
self._cur_world], 'image_id {} is not in {}'.format(
image_id, self._cur_world)
return self._all_graph[self._cur_world][image_id][action]
def _largest_detection_for_image(self, image_id, detections_dict):
"""Assigns area of the largest box for the view with given image id.
Args:
image_id: Image id of the view.
detections_dict: Detections for the view.
"""
for cls, box, score in zip(detections_dict['detection_classes'],
detections_dict['detection_boxes'],
detections_dict['detection_scores']):
if cls not in self._targets:
continue
if score < self._confidence_threshold:
continue
ymin, xmin, ymax, xmax = box
area = (ymax - ymin) * (xmax - xmin)
if abs(area) < 1e-5:
continue
if image_id not in self._detection_area:
self._detection_area[image_id] = area
else:
self._detection_area[image_id] = max(self._detection_area[image_id],
area)
def _compute_goal_indexes(self):
"""Computes the goal indexes for the environment.
Returns:
      The indexes of the goals that are closest to target categories. A vertex
      is a goal vertex if the desired objects are detected in the image and the
      target categories are not seen by moving forward from that vertex.
"""
for image_id in self._world_id_dict[self._cur_world]:
detections_dict = self._detection_table[image_id]
self._largest_detection_for_image(image_id, detections_dict)
goal_indexes = []
for image_id in self._world_id_dict[self._cur_world]:
if image_id not in self._detection_area:
continue
      # Skip views whose largest detection box is too small.
if self._detection_area[image_id] < 0.01:
continue
ok = True
next_image_id = self._next_image(image_id, 'forward')
if next_image_id:
if next_image_id in self._detection_area:
ok = False
if ok:
goal_indexes.append(self._cur_graph.id_to_index[image_id])
return goal_indexes
def to_image_id(self, vid):
"""Converts vertex id to the image id.
Args:
vid: vertex id of the view.
Returns:
image id of the input vertex id.
"""
return self._cur_graph.index_to_id[vid]
def to_vertex(self, image_id):
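    """Converts an image id to its vertex index in the current world graph."""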
return self._cur_graph.id_to_index[image_id]
def observation(self, view_pose):
"""Returns the observation at the given the vertex.
Args:
view_pose: pose of the view of interest.
Returns:
Observation at the given view point.
Raises:
ValueError: if the given view pose is not similar to any of the poses in
the current world.
"""
vertex = self.pose_to_vertex(view_pose)
if vertex is None:
      raise ValueError('The given pose is not close enough to any of the poses'
                       ' in the environment.')
image_id = self._cur_graph.index_to_id[vertex]
output = collections.OrderedDict()
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
output[task_env.ModalityTypes.
SEMANTIC_SEGMENTATION] = self._semantic_segmentations[
self._cur_world][image_id]
detection = None
need_det = (
task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or
(task_env.ModalityTypes.IMAGE in self._modality_types and
self._should_draw_detections))
if need_det:
detection = self._detection_table[image_id]
detection_image = generate_detection_image(
detection,
self._output_size,
self._category_map,
num_classes=self._num_detection_classes)
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image
if task_env.ModalityTypes.DEPTH in self._modality_types:
output[task_env.ModalityTypes.DEPTH] = self._depth_images[
self._cur_world][image_id]
if task_env.ModalityTypes.IMAGE in self._modality_types:
output_img = self._cached_imgs[self._cur_world][image_id]
if self._should_draw_detections:
output_img = output_img.copy()
_draw_detections(output_img, detection, self._category_index)
output[task_env.ModalityTypes.IMAGE] = output_img
if task_env.ModalityTypes.GOAL in self._modality_types:
goal = np.zeros((len(self._targets),), dtype=np.float32)
goal[self._targets.index(self._cur_goal)] = 1.
output[task_env.ModalityTypes.GOAL] = goal
if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action
if task_env.ModalityTypes.DISTANCE in self._modality_types:
output[task_env.ModalityTypes.DISTANCE] = np.asarray(
[self.gt_value(self._cur_goal, vertex)], dtype=np.float32)
return output
def _step_no_reward(self, action):
"""Performs a step in the environment with given action.
Args:
action: Action that is used to step in the environment. Action can be
string or integer. If the type is integer then it uses the ith element
from self._actions list. Otherwise, uses the string value as the action.
Returns:
observation, done, info
      observation: dictionary that contains all the observations specified in
modality_types.
observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the
detection of the current view.
observation[task_env.ModalityTypes.IMAGE]: contains the
image of the current view. Note that if using the images for training,
should_load_images should be set to false.
observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the
semantic segmentation of the current view.
observation[task_env.ModalityTypes.DEPTH]: If selected, returns the
depth map for the current view.
observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
a numpy of (action_size + 1,). The first action_size elements indicate
the action and the last element indicates whether the previous action
was successful or not.
done: True after episode_length steps have been taken, False otherwise.
info: Empty dictionary.
Raises:
ValueError: for invalid actions.
"""
# Primarily used for gym interface.
if not isinstance(action, str):
if not self.action_space.contains(action):
        raise ValueError('Not a valid action: {}'.format(action))
action = self._actions[action]
if action not in self._actions:
      raise ValueError('Not a valid action: {}'.format(action))
action_index = self._actions.index(action)
if action == 'stop':
next_image_id = self._cur_image_id
done = True
success = True
else:
next_image_id = self._next_image(self._cur_image_id, action)
self._steps_taken += 1
done = False
success = True
if not next_image_id:
success = False
else:
self._cur_image_id = next_image_id
if self._steps_taken >= self._episode_length:
done = True
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
observation = self.observation(self.vertex_to_pose(cur_vertex))
# Concatenation of one-hot prev action + a binary number for success of
# previous actions.
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
self._prev_action[action_index] = 1.
self._prev_action[-1] = float(success)
distance_to_goal = self.gt_value(self._cur_goal, cur_vertex)
if success:
if distance_to_goal <= self._reward_goal_range:
done = True
return observation, done, {'success': success}
@property
def graph(self):
return self._cur_graph.graph
def state(self):
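    """Returns the pose of the agent's current view."""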
return self.vertex_to_pose(self.to_vertex(self._cur_image_id))
def gt_value(self, goal, v):
"""Computes the distance to the goal from vertex v.
Args:
goal: name of the goal.
v: vertex id.
Returns:
      Minimum number of steps to the given goal.
"""
assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal)
assert v in self._cur_graph.distance_to_goal[goal]
res = self._cur_graph.distance_to_goal[goal][v]
return res
def _update_graph(self):
"""Creates the graph for each environment and updates the _cur_graph."""
if self._cur_world not in self._graph_cache:
graph = nx.DiGraph()
id_to_index = {}
index_to_id = {}
image_list = self._world_id_dict[self._cur_world]
for i, image_id in enumerate(image_list):
id_to_index[image_id] = i
index_to_id[i] = image_id
graph.add_node(i)
for image_id in image_list:
for action in self._actions:
if action == 'stop':
continue
next_image = self._all_graph[self._cur_world][image_id][action]
if next_image:
graph.add_edge(
id_to_index[image_id], id_to_index[next_image], action=action)
target_indexes = {}
number_of_nodes_without_targets = graph.number_of_nodes()
distance_to_goal = {}
for goal in self._targets:
if self._cur_world not in self._annotated_targets[goal]:
continue
goal_indexes = [
id_to_index[i]
for i in self._annotated_targets[goal][self._cur_world]
if i
]
super_source_index = graph.number_of_nodes()
target_indexes[goal] = super_source_index
graph.add_node(super_source_index)
index_to_id[super_source_index] = goal
id_to_index[goal] = super_source_index
for v in goal_indexes:
graph.add_edge(v, super_source_index, action='stop')
graph.add_edge(super_source_index, v, action='stop')
distance_to_goal[goal] = {}
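        # Shortest paths are computed to the super node attached to all views
        # of this goal. len(path) counts nodes, so subtracting 2 (the start
        # node and the super node itself) yields the number of actions needed
        # to reach the closest goal view.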
for v in range(number_of_nodes_without_targets):
distance_to_goal[goal][v] = len(
nx.shortest_path(graph, v, super_source_index)) - 2
self._graph_cache[self._cur_world] = _Graph(
graph, id_to_index, index_to_id, target_indexes, distance_to_goal)
self._cur_graph = self._graph_cache[self._cur_world]
def reset_for_eval(self, new_world, new_goal, new_image_id):
"""Resets to the given goal and image_id."""
return self._reset_env(new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)
def get_init_config(self, path):
"""Exposes the initial state of the agent for the given path.
Args:
path: sequences of the vertexes that the agent moves.
Returns:
image_id of the first view, world, and the goal.
"""
return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal
def _reset_env(
self,
new_world=None,
new_goal=None,
new_image_id=None,
):
"""Resets the agent in a random world and random id.
Args:
new_world: If not None, sets the new world to new_world.
new_goal: If not None, sets the new goal to new_goal.
new_image_id: If not None, sets the first image id to new_image_id.
Returns:
observation: dictionary of the observations. Content of the observation
is similar to that of the step function.
Raises:
ValueError: if it can't find a world and annotated goal.
"""
self._steps_taken = 0
    # The first prev_action is a special all-zero vector + success=1.
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
self._prev_action[len(self._actions)] = 1.
if self._eval_init_points is not None:
if self._eval_init_index >= len(self._eval_init_points):
self._eval_init_index = 0
a = self._eval_init_points[self._eval_init_index]
self._cur_world, self._cur_image_id, self._cur_goal = a
self._eval_init_index += 1
elif not new_world:
attempts = 100
found = False
while attempts >= 0:
attempts -= 1
self._cur_goal = np.random.choice(self._targets)
available_worlds = list(
set(self._annotated_targets[self._cur_goal].keys()).intersection(
set(self._worlds)))
if available_worlds:
found = True
break
if not found:
raise ValueError('could not find a world that has a target annotated')
self._cur_world = np.random.choice(available_worlds)
else:
self._cur_world = new_world
self._cur_goal = new_goal
if new_world not in self._annotated_targets[new_goal]:
return None
self._cur_goal_index = self._targets.index(self._cur_goal)
if new_image_id:
self._cur_image_id = new_image_id
else:
self._cur_image_id = np.random.choice(
self._world_id_dict[self._cur_world])
if self._cur_world not in self._detection_cache:
with tf.gfile.Open(
_get_detection_path(self._dataset_root, self._detection_folder_name,
self._cur_world)) as f:
# Each file contains a dictionary with image ids as keys and detection
# dicts as values.
self._detection_cache[self._cur_world] = np.load(f).item()
self._detection_table = self._detection_cache[self._cur_world]
self._detection_area = {}
self._update_graph()
if self._cur_world not in self._vertex_to_pose:
      # Add a fake pose for the super node of each target category.
self._vertex_to_pose[self._cur_world] = {
index: (-index,) for index in self._cur_graph.target_indexes.values()
}
      # Calling vertex_to_pose for each vertex fills out the dictionaries that
      # contain pose-related data.
for image_id in self._world_id_dict[self._cur_world]:
self.vertex_to_pose(self.to_vertex(image_id))
# Filling out pose_to_vertex from vertex_to_pose.
self._pose_to_vertex[self._cur_world] = {
tuple(v): k
          for k, v in self._vertex_to_pose[self._cur_world].items()
}
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
observation = self.observation(self.vertex_to_pose(cur_vertex))
return observation
def cur_vertex(self):
return self._cur_graph.id_to_index[self._cur_image_id]
def cur_image_id(self):
return self._cur_image_id
def path_to_goal(self, image_id=None):
"""Returns the path from image_id to the self._cur_goal.
Args:
image_id: If set to None, computes the path from the current view.
Otherwise, sets the current view to the given image_id.
Returns:
The path to the goal.
Raises:
Exception if there's no path from the view to the goal.
"""
if image_id is None:
image_id = self._cur_image_id
super_source = self._cur_graph.target_indexes[self._cur_goal]
try:
path = nx.shortest_path(self._cur_graph.graph,
self._cur_graph.id_to_index[image_id],
super_source)
except:
      print('path not found, world = {}, image_id = {}'.format(
          self._cur_world, self._cur_image_id))
raise
return path[:-1]
def targets(self):
return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])]
def vertex_to_pose(self, v):
"""Returns pose of the view for a given vertex.
Args:
v: integer, vertex index.
Returns:
      (x, z, dir_x, dir_z) where x and z are the translation and dir_x, dir_z
      are a vector giving the direction of the view.
"""
if v in self._vertex_to_pose[self._cur_world]:
return np.copy(self._vertex_to_pose[self._cur_world][v])
x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id(
v)]
if rot is None: # if rotation is not provided for the given vertex.
self._vertex_to_pose[self._cur_world][v] = np.asarray(
[x * scale, z * scale, v])
return np.copy(self._vertex_to_pose[self._cur_world][v])
# Multiply rotation matrix by [0,0,1] to get a vector of length 1 in the
# direction of the ray.
direction = np.zeros((3, 1), dtype=np.float32)
direction[2][0] = 1
direction = np.matmul(np.transpose(rot), direction)
direction = [direction[0][0], direction[2][0]]
self._vertex_to_pose[self._cur_world][v] = np.asarray(
[x * scale, z * scale, direction[0], direction[1]])
return np.copy(self._vertex_to_pose[self._cur_world][v])
def pose_to_vertex(self, pose):
"""Returns the vertex id for the given pose."""
if tuple(pose) not in self._pose_to_vertex[self._cur_world]:
raise ValueError(
'The given pose is not present in the dictionary: {}'.format(
tuple(pose)))
return self._pose_to_vertex[self._cur_world][tuple(pose)]
def check_scene_graph(self, world, goal):
"""Checks the connectivity of the scene graph.
    Goes over all the views and computes the shortest path to the goal. If it
    crashes, the graph is not connected. Otherwise, the env graph is fine.
Args:
world: the string name of the world.
goal: the string label for the goal.
Returns:
Nothing.
"""
obs = self._reset_env(new_world=world, new_goal=goal)
if not obs:
      print('{} is not available in {}'.format(goal, world))
return True
for image_id in self._world_id_dict[self._cur_world]:
      print('check image_id = {}'.format(image_id))
self._cur_image_id = image_id
path = self.path_to_goal()
actions = []
for i in range(len(path) - 2):
actions.append(self.action(path[i], path[i + 1]))
actions.append('stop')
@property
def goal_one_hot(self):
res = np.zeros((len(self._targets),), dtype=np.float32)
res[self._cur_goal_index] = 1.
return res
@property
def goal_index(self):
return self._cur_goal_index
@property
def goal_string(self):
return self._cur_goal
@property
def worlds(self):
return self._worlds
@property
def possible_targets(self):
return self._targets
def action(self, from_pose, to_pose):
"""Returns the action that takes source vertex to destination vertex.
Args:
from_pose: pose of the source.
to_pose: pose of the destination.
Returns:
Returns the index of the action.
Raises:
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with one action.
"""
from_index = self.pose_to_vertex(from_pose)
to_index = self.pose_to_vertex(to_pose)
if to_index not in self.graph[from_index]:
from_image_id = self.to_image_id(from_index)
to_image_id = self.to_image_id(to_index)
raise ValueError('{},{} is not connected to {},{}'.format(
from_index, from_image_id, to_index, to_image_id))
return self._actions.index(self.graph[from_index][to_index]['action'])
def random_step_sequence(self, min_len=None, max_len=None):
"""Generates random step sequence that takes agent to the goal.
Args:
min_len: integer, minimum length of a step sequence. Not yet implemented.
      max_len: integer, the maximum number of steps, i.e. the maximum length
        of the returned path and observation sequence.
Returns:
Tuple of (path, actions, states, step_outputs).
path: a random path from a random starting point and random environment.
actions: actions of the returned path.
states: viewpoints of all the states in between.
step_outputs: list of step() return tuples.
Raises:
      ValueError: if max_len is None or smaller than 1; if min_len is
        different from None.
"""
if max_len is None:
raise ValueError('max_len can not be set as None')
if max_len < 1:
      raise ValueError('max_len must be greater or equal to 1.')
if min_len is not None:
raise ValueError('min_len is not yet implemented.')
path = []
actions = []
states = []
step_outputs = []
obs = self.reset()
last_obs_tuple = [obs, 0, False, {}]
    for _ in range(max_len):
action = np.random.choice(self._actions)
      # We don't want to sample the stop action because stop does not add new
      # information.
while action == 'stop':
action = np.random.choice(self._actions)
path.append(self.to_vertex(self._cur_image_id))
onehot = np.zeros((len(self._actions),), dtype=np.float32)
onehot[self._actions.index(action)] = 1.
actions.append(onehot)
states.append(self.vertex_to_pose(path[-1]))
step_outputs.append(copy.deepcopy(last_obs_tuple))
last_obs_tuple = self.step(action)
return path, actions, states, step_outputs
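# A minimal usage sketch (illustrative only; the argument values below are
# placeholders and would normally come from a gin config and a local copy of
# the Active Vision Dataset):
#   env = ActiveVisionDatasetEnv(
#       episode_length=100,
#       modality_types=[task_env.ModalityTypes.IMAGE,
#                       task_env.ModalityTypes.GOAL],
#       confidence_threshold=0.5,
#       output_size=64,
#       worlds=['Home_001_1'],
#       targets=['fridge'],
#       compute_distance=False,
#       should_draw_detections=False,
#       dataset_root='/path/to/AVD',
#       labelmap_path='/path/to/label_map.pbtxt',
#       reward_collision=-0.1,
#       reward_goal_range=2,
#       num_detection_classes=90,
#       segmentation_file_name='sseg',
#       detection_folder_name='detections',
#       actions=['right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'stop'],
#       targets_file_name='targets')
#   obs = env.reset()
#   obs, reward, done, info = env.step('forward')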