| # Copyright 2018 The TensorFlow Authors All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ============================================================================== | |
| """Gym environment for the ActiveVision Dataset. | |
| The dataset is captured with a robot moving around and taking picture in | |
| multiple directions. The actions are moving in four directions, and rotate | |
| clockwise or counter clockwise. The observations are the output of vision | |
| pipelines such as object detectors. The goal is to find objects of interest | |
| in each environment. For more details, refer: | |
| http://cs.unc.edu/~ammirato/active_vision_dataset_website/. | |
| """ | |
import collections
import copy
import json
import os
from StringIO import StringIO
import time

from absl import logging
import cv2
import gin
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
import tensorflow as tf

from envs import task_env
import label_map_util
import visualization_utils as vis_util
| register( | |
| id='active-vision-env-v0', | |
| entry_point= | |
| 'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv', # pylint: disable=line-too-long | |
| ) | |
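# A minimal construction sketch. The argument values below are illustrative
# only (file names, paths, and numbers are not defaults); in the original
# setup they are typically supplied through a gin config:
#
#   env = ActiveVisionDatasetEnv(
#       episode_length=100,
#       modality_types=[task_env.ModalityTypes.IMAGE,
#                       task_env.ModalityTypes.GOAL],
#       confidence_threshold=0.5,
#       output_size=64,
#       worlds=['Home_001_1'],
#       targets=['fridge'],
#       compute_distance=False,
#       should_draw_detections=False,
#       dataset_root='/path/to/AVD',
#       labelmap_path='/path/to/label_map.pbtxt',
#       reward_collision=-0.1,
#       reward_goal_range=2,
#       num_detection_classes=90,
#       segmentation_file_name='sseg',
#       detection_folder_name='Detections',
#       actions=['right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'stop'],
#       targets_file_name='annotated_targets')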
| _MAX_DEPTH_VALUE = 12102 | |
| SUPPORTED_ACTIONS = [ | |
| 'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop' | |
| ] | |
| SUPPORTED_MODALITIES = [ | |
| task_env.ModalityTypes.SEMANTIC_SEGMENTATION, | |
| task_env.ModalityTypes.DEPTH, | |
| task_env.ModalityTypes.OBJECT_DETECTION, | |
| task_env.ModalityTypes.IMAGE, | |
| task_env.ModalityTypes.GOAL, | |
| task_env.ModalityTypes.PREV_ACTION, | |
| task_env.ModalityTypes.DISTANCE, | |
| ] | |
| # Data structure for storing the information related to the graph of the world. | |
| _Graph = collections.namedtuple('_Graph', [ | |
| 'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal' | |
| ]) | |
def _init_category_index(label_map_path):
  """Creates a category index that maps class indexes to class names.

  Args:
    label_map_path: path to the label map.

  Returns:
    A dict mapping integer class ids to the corresponding categories.
  """
  label_map = label_map_util.load_labelmap(label_map_path)
  num_classes = max(x.id for x in label_map.item)
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes=num_classes, use_display_name=True)
  category_index = label_map_util.create_category_index(categories)
  return category_index
def _draw_detections(image_np, detections, category_index):
  """Draws detections onto the image.

  Args:
    image_np: uint8 numpy array holding the image.
    detections: a dictionary that contains the detection outputs.
    category_index: mapping between class indexes and category names.

  Returns:
    Nothing; the boxes are drawn on image_np in place.
  """
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      detections['detection_boxes'],
      detections['detection_classes'],
      detections['detection_scores'],
      category_index,
      use_normalized_coordinates=True,
      max_boxes_to_draw=1000,
      min_score_thresh=.0,
      agnostic_mode=False)
def generate_detection_image(detections,
                             image_size,
                             category_map,
                             num_classes,
                             is_binary=True):
  """Generates a per-class detection image from the detection boxes.

  Args:
    detections: 2D object detections from the image. It's a dictionary that
      contains detection_boxes, detection_classes, and detection_scores with
      dimensions of nx4, nx1, nx1 where n is the number of detections.
    image_size: The resolution of the output image.
    category_map: dictionary that maps detection labels to channel indexes.
    num_classes: Number of classes.
    is_binary: If True, the corresponding channel is set to 0 or 1. Otherwise,
      the detection score is written into the corresponding channel.

  Returns:
    An image_size x image_size x num_classes image for the detection boxes.
  """
  res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
  boxes = detections['detection_boxes']
  labels = detections['detection_classes']
  scores = detections['detection_scores']
  for box, label, score in zip(boxes, labels, scores):
    transformed_boxes = [int(round(t)) for t in box * image_size]
    y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with zero area
    # do not correspond to any real detection, so they are skipped.
    if (y2 - y1) * (x2 - x1) == 0:
      continue
    assert category_map[label] < num_classes, 'label = {}'.format(label)
    value = score
    if is_binary:
      value = 1
    res[y1:y2, x1:x2, category_map[label]] = value
  return res
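# Illustrative sketch (not part of the original pipeline): a single
# hypothetical detection covering the top-left quadrant, with label 3 mapped
# to channel 0 and score 0.9, fills that quadrant of channel 0 with ones when
# is_binary=True (or with 0.9 otherwise):
#
#   detections = {
#       'detection_boxes': np.array([[0., 0., .5, .5]]),
#       'detection_classes': np.array([3]),
#       'detection_scores': np.array([.9]),
#   }
#   det_img = generate_detection_image(
#       detections, image_size=16, category_map={3: 0}, num_classes=1)
#   # det_img[0:8, 0:8, 0] is 1., everything else is 0.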
| def _get_detection_path(root, detection_folder_name, world): | |
| return os.path.join(root, 'Meta', detection_folder_name, world + '.npy') | |
| def _get_image_folder(root, world): | |
| return os.path.join(root, world, 'jpg_rgb') | |
| def _get_json_path(root, world): | |
| return os.path.join(root, world, 'annotations.json') | |
| def _get_image_path(root, world, image_id): | |
| return os.path.join(_get_image_folder(root, world), image_id + '.jpg') | |
def _get_image_list(path, worlds):
  """Builds a dictionary of image ids for all the worlds.

  Args:
    path: the path to the dataset root.
    worlds: list of the worlds.

  Returns:
    Dictionary where the keys are the world names and the values are the
    image ids of that world.
  """
  world_id_dict = {}
  for loc in worlds:
    files = [
        t[:-4] for t in tf.gfile.ListDirectory(_get_image_folder(path, loc))
    ]
    world_id_dict[loc] = files
  return world_id_dict
def read_all_poses(dataset_root, world):
  """Reads all the poses for each world.

  Args:
    dataset_root: the path to the root of the dataset.
    world: string, name of the world.

  Returns:
    Dictionary of poses for all the images in each world. The key is the
    image id of each view and the values are tuples (x, z, R, scale), where x
    and z are the first and third coordinates of the translation, R is the
    3x3 rotation matrix, and scale is a float scalar that x and z need to be
    multiplied by in order to get the real world coordinates.

  Raises:
    ValueError: if the number of images does not match the number of poses.
  """
| path = os.path.join(dataset_root, world, 'image_structs.mat') | |
| with tf.gfile.Open(path) as f: | |
| data = sio.loadmat(f) | |
| xyz = data['image_structs']['world_pos'] | |
| image_names = data['image_structs']['image_name'][0] | |
| rot = data['image_structs']['R'][0] | |
| scale = data['scale'][0][0] | |
| n = xyz.shape[1] | |
| x = [xyz[0][i][0][0] for i in range(n)] | |
| z = [xyz[0][i][2][0] for i in range(n)] | |
| names = [name[0][:-4] for name in image_names] | |
  if len(names) != len(x):
    raise ValueError('number of image names is not equal to the number of '
                     'poses: {} != {}'.format(len(names), len(x)))
| output = {} | |
| for i in range(n): | |
| if rot[i].shape[0] != 0: | |
| assert rot[i].shape[0] == 3 | |
| assert rot[i].shape[1] == 3 | |
| output[names[i]] = (x[i], z[i], rot[i], scale) | |
| else: | |
| output[names[i]] = (x[i], z[i], None, scale) | |
| return output | |
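# Usage sketch (the dataset path and image id below are hypothetical):
#
#   poses = read_all_poses('/path/to/AVD', 'Home_001_1')
#   x, z, rot, scale = poses['000110000010101']
#   world_x, world_z = x * scale, z * scale  # approximate world coordinates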
| def read_cached_data(should_load_images, dataset_root, segmentation_file_name, | |
| targets_file_name, output_size): | |
| """Reads all the necessary cached data. | |
| Args: | |
| should_load_images: whether to load the images or not. | |
| dataset_root: path to the root of the dataset. | |
| segmentation_file_name: The name of the file that contains semantic | |
| segmentation annotations. | |
| targets_file_name: The name of the file the contains targets annotated for | |
| each world. | |
| output_size: Size of the output images. This is used for pre-processing the | |
| loaded images. | |
| Returns: | |
| Dictionary of all the cached data. | |
| """ | |
| load_start = time.time() | |
| result_data = {} | |
| annotated_target_path = os.path.join(dataset_root, 'Meta', | |
| targets_file_name + '.npy') | |
| logging.info('loading targets: %s', annotated_target_path) | |
| with tf.gfile.Open(annotated_target_path) as f: | |
| result_data['targets'] = np.load(f).item() | |
| depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy') | |
| logging.info('loading depth: %s', depth_image_path) | |
| with tf.gfile.Open(depth_image_path) as f: | |
| depth_data = np.load(f).item() | |
| logging.info('processing depth') | |
| for home_id in depth_data: | |
| images = depth_data[home_id] | |
| for image_id in images: | |
| depth = images[image_id] | |
| depth = cv2.resize( | |
| depth / _MAX_DEPTH_VALUE, (output_size, output_size), | |
| interpolation=cv2.INTER_NEAREST) | |
| depth_mask = (depth > 0).astype(np.float32) | |
| depth = np.dstack((depth, depth_mask)) | |
| images[image_id] = depth | |
| result_data[task_env.ModalityTypes.DEPTH] = depth_data | |
| sseg_path = os.path.join(dataset_root, 'Meta', | |
| segmentation_file_name + '.npy') | |
| logging.info('loading sseg: %s', sseg_path) | |
| with tf.gfile.Open(sseg_path) as f: | |
| sseg_data = np.load(f).item() | |
| logging.info('processing sseg') | |
| for home_id in sseg_data: | |
| images = sseg_data[home_id] | |
| for image_id in images: | |
| sseg = images[image_id] | |
| sseg = cv2.resize( | |
| sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST) | |
| images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32) | |
| result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data | |
| if should_load_images: | |
| image_path = os.path.join(dataset_root, 'Meta/imgs.npy') | |
| logging.info('loading imgs: %s', image_path) | |
| with tf.gfile.Open(image_path) as f: | |
| image_data = np.load(f).item() | |
| result_data[task_env.ModalityTypes.IMAGE] = image_data | |
| with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f: | |
| result_data['world_id_dict'] = np.load(f).item() | |
  logging.info('loading done in %f seconds', time.time() - load_start)
| return result_data | |
| def get_spec_dtype_map(): | |
| return {gym.spaces.Box: np.float32} | |
| class ActiveVisionDatasetEnv(task_env.TaskEnv): | |
| """Simulates the environment from ActiveVisionDataset.""" | |
| cached_data = None | |
| def __init__( | |
| self, | |
| episode_length, | |
| modality_types, | |
| confidence_threshold, | |
| output_size, | |
| worlds, | |
| targets, | |
| compute_distance, | |
| should_draw_detections, | |
| dataset_root, | |
| labelmap_path, | |
| reward_collision, | |
| reward_goal_range, | |
| num_detection_classes, | |
| segmentation_file_name, | |
| detection_folder_name, | |
| actions, | |
| targets_file_name, | |
| eval_init_points_file_name=None, | |
| shaped_reward=False, | |
| ): | |
| """Instantiates the environment for ActiveVision Dataset. | |
| Args: | |
| episode_length: the length of each episode. | |
      modality_types: a list of strings, each naming a modality to be loaded.
        Valid entries are "sseg", "det", "depth", "image", "distance", and
        "prev_action". "distance" should be used for computing metrics in tf
        agents.
      confidence_threshold: Only detections with a score above
        confidence_threshold are considered as potential targets.
| output_size: Resolution of the output image. | |
| worlds: List of the name of the worlds. | |
| targets: List of the target names. Each entry is a string label of the | |
| target category (e.g. 'fridge', 'microwave', so on). | |
| compute_distance: If True, outputs the distance of the view to the goal. | |
      should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
| dataset_root: the path to the root folder of the dataset. | |
| labelmap_path: path to the dictionary that converts label strings to | |
| indexes. | |
| reward_collision: the reward the agents get after hitting an obstacle. | |
| It should be a non-positive number. | |
      reward_goal_range: the number of steps from the goal within which the
        agent is considered to have reached the goal. If the agent's distance
        is less than the specified goal range, the episode also finishes by
        setting done = True.
| num_detection_classes: number of classes that detector outputs. | |
| segmentation_file_name: the name of the file that contains the semantic | |
| information. The file should be in the dataset_root/Meta/ folder. | |
| detection_folder_name: Name of the folder that contains the detections | |
| for each world. The folder should be under dataset_root/Meta/ folder. | |
| actions: The list of the action names. Valid entries are listed in | |
| SUPPORTED_ACTIONS. | |
      targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
| eval_init_points_file_name: The name of the file that contains the initial | |
| points for evaluating the performance of the agent. If set to None, | |
| episodes start at random locations. Should be only set for evaluation. | |
| shaped_reward: Whether to add delta goal distance to the reward each step. | |
| Raises: | |
      ValueError: If one of the targets is not available in the annotated
        targets or one of the modality names is not from the domain specified
        above.
| ValueError: If one of the actions is not in SUPPORTED_ACTIONS. | |
| ValueError: If the reward_collision is a positive number. | |
| ValueError: If there is no action other than stop provided. | |
| """ | |
    if reward_collision > 0:
      raise ValueError('reward for collision should be non-positive')
    if reward_goal_range < 0:
      logging.warning('environment will not terminate the episode based on '
                      'the distance of the agent to the goal')
    if not modality_types:
      raise ValueError('modality names cannot be empty')
| for name in modality_types: | |
| if name not in SUPPORTED_MODALITIES: | |
| raise ValueError('invalid modality type: {}'.format(name)) | |
| actions_other_than_stop_found = False | |
| for a in actions: | |
| if a != 'stop': | |
| actions_other_than_stop_found = True | |
| if a not in SUPPORTED_ACTIONS: | |
        raise ValueError('invalid action: {}'.format(a))
| if not actions_other_than_stop_found: | |
| raise ValueError('environment needs to have actions other than stop.') | |
| super(ActiveVisionDatasetEnv, self).__init__() | |
| self._episode_length = episode_length | |
| self._modality_types = set(modality_types) | |
| self._confidence_threshold = confidence_threshold | |
| self._output_size = output_size | |
| self._dataset_root = dataset_root | |
| self._worlds = worlds | |
| self._targets = targets | |
| self._all_graph = {} | |
| for world in self._worlds: | |
| with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f: | |
| file_content = f.read() | |
| file_content = file_content.replace('.jpg', '') | |
| io = StringIO(file_content) | |
| self._all_graph[world] = json.load(io) | |
| self._cur_world = '' | |
| self._cur_image_id = '' | |
| self._cur_graph = None # Loaded by _update_graph | |
| self._steps_taken = 0 | |
| self._last_action_success = True | |
| self._category_index = _init_category_index(labelmap_path) | |
| self._category_map = dict( | |
| [(c, i) for i, c in enumerate(self._category_index)]) | |
| self._detection_cache = {} | |
| if not ActiveVisionDatasetEnv.cached_data: | |
| ActiveVisionDatasetEnv.cached_data = read_cached_data( | |
| True, self._dataset_root, segmentation_file_name, targets_file_name, | |
| self._output_size) | |
| cached_data = ActiveVisionDatasetEnv.cached_data | |
| self._world_id_dict = cached_data['world_id_dict'] | |
| self._depth_images = cached_data[task_env.ModalityTypes.DEPTH] | |
| self._semantic_segmentations = cached_data[ | |
| task_env.ModalityTypes.SEMANTIC_SEGMENTATION] | |
| self._annotated_targets = cached_data['targets'] | |
| self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE] | |
| self._graph_cache = {} | |
| self._compute_distance = compute_distance | |
| self._should_draw_detections = should_draw_detections | |
| self._reward_collision = reward_collision | |
| self._reward_goal_range = reward_goal_range | |
| self._num_detection_classes = num_detection_classes | |
| self._actions = actions | |
| self._detection_folder_name = detection_folder_name | |
| self._shaped_reward = shaped_reward | |
| self._eval_init_points = None | |
| if eval_init_points_file_name is not None: | |
| self._eval_init_index = 0 | |
| init_points_path = os.path.join(self._dataset_root, 'Meta', | |
| eval_init_points_file_name + '.npy') | |
| with tf.gfile.Open(init_points_path) as points_file: | |
| data = np.load(points_file).item() | |
| self._eval_init_points = [] | |
| for world in self._worlds: | |
| for goal in self._targets: | |
| if world in self._annotated_targets[goal]: | |
| for image_id in data[world]: | |
| self._eval_init_points.append((world, image_id[0], goal)) | |
| logging.info('loaded %d eval init points', len(self._eval_init_points)) | |
| self.action_space = gym.spaces.Discrete(len(self._actions)) | |
| obs_shapes = {} | |
| if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box( | |
| low=0, high=255, shape=(self._output_size, self._output_size, 1)) | |
| if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box( | |
| low=0, | |
| high=255, | |
| shape=(self._output_size, self._output_size, | |
| self._num_detection_classes)) | |
| if task_env.ModalityTypes.DEPTH in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box( | |
| low=0, | |
| high=_MAX_DEPTH_VALUE, | |
| shape=(self._output_size, self._output_size, 2)) | |
| if task_env.ModalityTypes.IMAGE in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box( | |
| low=0, high=255, shape=(self._output_size, self._output_size, 3)) | |
| if task_env.ModalityTypes.GOAL in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box( | |
| low=0, high=1., shape=(len(self._targets),)) | |
| if task_env.ModalityTypes.PREV_ACTION in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box( | |
| low=0, high=1., shape=(len(self._actions) + 1,)) | |
| if task_env.ModalityTypes.DISTANCE in self._modality_types: | |
| obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box( | |
| low=0, high=255, shape=(1,)) | |
| self.observation_space = gym.spaces.Dict(obs_shapes) | |
| self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32) | |
| # Loading all the poses. | |
| all_poses = {} | |
| for world in self._worlds: | |
| all_poses[world] = read_all_poses(self._dataset_root, world) | |
| self._cached_poses = all_poses | |
| self._vertex_to_pose = {} | |
| self._pose_to_vertex = {} | |
| def actions(self): | |
| """Returns list of actions for the env.""" | |
| return self._actions | |
| def _next_image(self, image_id, action): | |
| """Given the action, returns the name of the image that agent ends up in. | |
| Args: | |
| image_id: The image id of the current view. | |
| action: valid actions are ['right', 'rotate_cw', 'rotate_ccw', | |
| 'forward', 'left']. Each rotation is 30 degrees. | |
| Returns: | |
| The image name for the next location of the agent. If the action results | |
| in collision or it is not possible for the agent to execute that action, | |
| returns empty string. | |
| """ | |
| assert action in self._actions, 'invalid action : {}'.format(action) | |
| assert self._cur_world in self._all_graph, 'invalid world {}'.format( | |
| self._cur_world) | |
| assert image_id in self._all_graph[ | |
| self._cur_world], 'image_id {} is not in {}'.format( | |
| image_id, self._cur_world) | |
| return self._all_graph[self._cur_world][image_id][action] | |
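  # The per-world navigation structure behind _next_image, loaded from
  # annotations.json with the '.jpg' suffixes stripped, is roughly of this
  # shape (the image ids below are hypothetical):
  #
  #   self._all_graph[world][image_id] == {
  #       'forward': '000110000020101',
  #       'rotate_cw': '000110000010102',
  #       ...
  #       'backward': '',  # empty string: that move is not possible
  #   }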
| def _largest_detection_for_image(self, image_id, detections_dict): | |
| """Assigns area of the largest box for the view with given image id. | |
| Args: | |
| image_id: Image id of the view. | |
| detections_dict: Detections for the view. | |
| """ | |
| for cls, box, score in zip(detections_dict['detection_classes'], | |
| detections_dict['detection_boxes'], | |
| detections_dict['detection_scores']): | |
| if cls not in self._targets: | |
| continue | |
| if score < self._confidence_threshold: | |
| continue | |
| ymin, xmin, ymax, xmax = box | |
| area = (ymax - ymin) * (xmax - xmin) | |
| if abs(area) < 1e-5: | |
| continue | |
| if image_id not in self._detection_area: | |
| self._detection_area[image_id] = area | |
| else: | |
| self._detection_area[image_id] = max(self._detection_area[image_id], | |
| area) | |
  def _compute_goal_indexes(self):
    """Computes the goal indexes for the environment.

    Returns:
      The indexes of the goals that are closest to target categories. A vertex
      is a goal vertex if the desired objects are detected in the image and
      the target categories are no longer seen by moving forward from that
      vertex.
    """
| for image_id in self._world_id_dict[self._cur_world]: | |
| detections_dict = self._detection_table[image_id] | |
| self._largest_detection_for_image(image_id, detections_dict) | |
| goal_indexes = [] | |
| for image_id in self._world_id_dict[self._cur_world]: | |
| if image_id not in self._detection_area: | |
| continue | |
      # Skip views whose largest detection box is not large enough.
      if self._detection_area[image_id] < 0.01:
| continue | |
| ok = True | |
| next_image_id = self._next_image(image_id, 'forward') | |
| if next_image_id: | |
| if next_image_id in self._detection_area: | |
| ok = False | |
| if ok: | |
| goal_indexes.append(self._cur_graph.id_to_index[image_id]) | |
| return goal_indexes | |
| def to_image_id(self, vid): | |
| """Converts vertex id to the image id. | |
| Args: | |
| vid: vertex id of the view. | |
| Returns: | |
| image id of the input vertex id. | |
| """ | |
| return self._cur_graph.index_to_id[vid] | |
| def to_vertex(self, image_id): | |
| return self._cur_graph.id_to_index[image_id] | |
  def observation(self, view_pose):
    """Returns the observation at the given vertex.

    Args:
      view_pose: pose of the view of interest.

    Returns:
      Observation at the given view point.

    Raises:
      ValueError: if the given view pose is not similar to any of the poses in
        the current world.
    """
    vertex = self.pose_to_vertex(view_pose)
    if vertex is None:
      raise ValueError('The given pose is not close enough to any of the '
                       'poses in the environment.')
| image_id = self._cur_graph.index_to_id[vertex] | |
| output = collections.OrderedDict() | |
| if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types: | |
| output[task_env.ModalityTypes. | |
| SEMANTIC_SEGMENTATION] = self._semantic_segmentations[ | |
| self._cur_world][image_id] | |
| detection = None | |
| need_det = ( | |
| task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or | |
| (task_env.ModalityTypes.IMAGE in self._modality_types and | |
| self._should_draw_detections)) | |
| if need_det: | |
| detection = self._detection_table[image_id] | |
| detection_image = generate_detection_image( | |
| detection, | |
| self._output_size, | |
| self._category_map, | |
| num_classes=self._num_detection_classes) | |
| if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types: | |
| output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image | |
| if task_env.ModalityTypes.DEPTH in self._modality_types: | |
| output[task_env.ModalityTypes.DEPTH] = self._depth_images[ | |
| self._cur_world][image_id] | |
| if task_env.ModalityTypes.IMAGE in self._modality_types: | |
| output_img = self._cached_imgs[self._cur_world][image_id] | |
| if self._should_draw_detections: | |
| output_img = output_img.copy() | |
| _draw_detections(output_img, detection, self._category_index) | |
| output[task_env.ModalityTypes.IMAGE] = output_img | |
| if task_env.ModalityTypes.GOAL in self._modality_types: | |
| goal = np.zeros((len(self._targets),), dtype=np.float32) | |
| goal[self._targets.index(self._cur_goal)] = 1. | |
| output[task_env.ModalityTypes.GOAL] = goal | |
| if task_env.ModalityTypes.PREV_ACTION in self._modality_types: | |
| output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action | |
| if task_env.ModalityTypes.DISTANCE in self._modality_types: | |
| output[task_env.ModalityTypes.DISTANCE] = np.asarray( | |
| [self.gt_value(self._cur_goal, vertex)], dtype=np.float32) | |
| return output | |
| def _step_no_reward(self, action): | |
| """Performs a step in the environment with given action. | |
| Args: | |
| action: Action that is used to step in the environment. Action can be | |
| string or integer. If the type is integer then it uses the ith element | |
| from self._actions list. Otherwise, uses the string value as the action. | |
| Returns: | |
| observation, done, info | |
      observation: dictionary that contains all the observations specified in
        modality_types.
| observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the | |
| detection of the current view. | |
| observation[task_env.ModalityTypes.IMAGE]: contains the | |
| image of the current view. Note that if using the images for training, | |
| should_load_images should be set to false. | |
| observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the | |
| semantic segmentation of the current view. | |
| observation[task_env.ModalityTypes.DEPTH]: If selected, returns the | |
| depth map for the current view. | |
      observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
        a numpy array of shape (action_size + 1,). The first action_size
        elements indicate the action and the last element indicates whether
        the previous action was successful.
| done: True after episode_length steps have been taken, False otherwise. | |
| info: Empty dictionary. | |
| Raises: | |
| ValueError: for invalid actions. | |
| """ | |
| # Primarily used for gym interface. | |
    if not isinstance(action, str):
      if not self.action_space.contains(action):
        raise ValueError('Not a valid action: {}'.format(action))
      action = self._actions[action]
    if action not in self._actions:
      raise ValueError('Not a valid action: {}'.format(action))
| action_index = self._actions.index(action) | |
| if action == 'stop': | |
| next_image_id = self._cur_image_id | |
| done = True | |
| success = True | |
| else: | |
| next_image_id = self._next_image(self._cur_image_id, action) | |
| self._steps_taken += 1 | |
| done = False | |
| success = True | |
| if not next_image_id: | |
| success = False | |
| else: | |
| self._cur_image_id = next_image_id | |
| if self._steps_taken >= self._episode_length: | |
| done = True | |
| cur_vertex = self._cur_graph.id_to_index[self._cur_image_id] | |
| observation = self.observation(self.vertex_to_pose(cur_vertex)) | |
| # Concatenation of one-hot prev action + a binary number for success of | |
| # previous actions. | |
| self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32) | |
| self._prev_action[action_index] = 1. | |
| self._prev_action[-1] = float(success) | |
| distance_to_goal = self.gt_value(self._cur_goal, cur_vertex) | |
| if success: | |
| if distance_to_goal <= self._reward_goal_range: | |
| done = True | |
| return observation, done, {'success': success} | |
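  # Minimal interaction sketch (assumes an `env` instance built with valid
  # dataset paths; reset()/step() are the gym-style wrappers provided by the
  # task_env.TaskEnv base class, as used by random_step_sequence below):
  #
  #   obs = env.reset()
  #   done = False
  #   while not done:
  #       action = env.action_space.sample()
  #       obs, reward, done, info = env.step(action)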
| def graph(self): | |
| return self._cur_graph.graph | |
| def state(self): | |
| return self.vertex_to_pose(self.to_vertex(self._cur_image_id)) | |
| def gt_value(self, goal, v): | |
| """Computes the distance to the goal from vertex v. | |
| Args: | |
| goal: name of the goal. | |
| v: vertex id. | |
| Returns: | |
      Minimum number of steps to the given goal.
| """ | |
| assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal) | |
| assert v in self._cur_graph.distance_to_goal[goal] | |
| res = self._cur_graph.distance_to_goal[goal][v] | |
| return res | |
| def _update_graph(self): | |
| """Creates the graph for each environment and updates the _cur_graph.""" | |
| if self._cur_world not in self._graph_cache: | |
| graph = nx.DiGraph() | |
| id_to_index = {} | |
| index_to_id = {} | |
| image_list = self._world_id_dict[self._cur_world] | |
| for i, image_id in enumerate(image_list): | |
| id_to_index[image_id] = i | |
| index_to_id[i] = image_id | |
| graph.add_node(i) | |
| for image_id in image_list: | |
| for action in self._actions: | |
| if action == 'stop': | |
| continue | |
| next_image = self._all_graph[self._cur_world][image_id][action] | |
| if next_image: | |
| graph.add_edge( | |
| id_to_index[image_id], id_to_index[next_image], action=action) | |
| target_indexes = {} | |
| number_of_nodes_without_targets = graph.number_of_nodes() | |
| distance_to_goal = {} | |
| for goal in self._targets: | |
| if self._cur_world not in self._annotated_targets[goal]: | |
| continue | |
| goal_indexes = [ | |
| id_to_index[i] | |
| for i in self._annotated_targets[goal][self._cur_world] | |
| if i | |
| ] | |
| super_source_index = graph.number_of_nodes() | |
| target_indexes[goal] = super_source_index | |
| graph.add_node(super_source_index) | |
| index_to_id[super_source_index] = goal | |
| id_to_index[goal] = super_source_index | |
| for v in goal_indexes: | |
| graph.add_edge(v, super_source_index, action='stop') | |
| graph.add_edge(super_source_index, v, action='stop') | |
| distance_to_goal[goal] = {} | |
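        # nx.shortest_path returns the list of nodes on the path (including
        # both endpoints), and the final hop is the artificial 'stop' edge
        # into the super source added above, so len(...) - 2 is the number of
        # real moves needed to reach the closest annotated goal view.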
| for v in range(number_of_nodes_without_targets): | |
| distance_to_goal[goal][v] = len( | |
| nx.shortest_path(graph, v, super_source_index)) - 2 | |
| self._graph_cache[self._cur_world] = _Graph( | |
| graph, id_to_index, index_to_id, target_indexes, distance_to_goal) | |
| self._cur_graph = self._graph_cache[self._cur_world] | |
| def reset_for_eval(self, new_world, new_goal, new_image_id): | |
| """Resets to the given goal and image_id.""" | |
| return self._reset_env(new_world=new_world, new_goal=new_goal, new_image_id=new_image_id) | |
| def get_init_config(self, path): | |
| """Exposes the initial state of the agent for the given path. | |
| Args: | |
      path: sequence of the vertexes along which the agent moves.
| Returns: | |
| image_id of the first view, world, and the goal. | |
| """ | |
| return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal | |
| def _reset_env( | |
| self, | |
| new_world=None, | |
| new_goal=None, | |
| new_image_id=None, | |
| ): | |
| """Resets the agent in a random world and random id. | |
| Args: | |
| new_world: If not None, sets the new world to new_world. | |
| new_goal: If not None, sets the new goal to new_goal. | |
| new_image_id: If not None, sets the first image id to new_image_id. | |
| Returns: | |
| observation: dictionary of the observations. Content of the observation | |
| is similar to that of the step function. | |
| Raises: | |
| ValueError: if it can't find a world and annotated goal. | |
| """ | |
| self._steps_taken = 0 | |
    # The first prev_action is a special all-zero vector with success = 1.
| self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32) | |
| self._prev_action[len(self._actions)] = 1. | |
| if self._eval_init_points is not None: | |
| if self._eval_init_index >= len(self._eval_init_points): | |
| self._eval_init_index = 0 | |
| a = self._eval_init_points[self._eval_init_index] | |
| self._cur_world, self._cur_image_id, self._cur_goal = a | |
| self._eval_init_index += 1 | |
| elif not new_world: | |
| attempts = 100 | |
| found = False | |
| while attempts >= 0: | |
| attempts -= 1 | |
| self._cur_goal = np.random.choice(self._targets) | |
| available_worlds = list( | |
| set(self._annotated_targets[self._cur_goal].keys()).intersection( | |
| set(self._worlds))) | |
| if available_worlds: | |
| found = True | |
| break | |
| if not found: | |
| raise ValueError('could not find a world that has a target annotated') | |
| self._cur_world = np.random.choice(available_worlds) | |
| else: | |
| self._cur_world = new_world | |
| self._cur_goal = new_goal | |
| if new_world not in self._annotated_targets[new_goal]: | |
| return None | |
| self._cur_goal_index = self._targets.index(self._cur_goal) | |
| if new_image_id: | |
| self._cur_image_id = new_image_id | |
| else: | |
| self._cur_image_id = np.random.choice( | |
| self._world_id_dict[self._cur_world]) | |
| if self._cur_world not in self._detection_cache: | |
| with tf.gfile.Open( | |
| _get_detection_path(self._dataset_root, self._detection_folder_name, | |
| self._cur_world)) as f: | |
| # Each file contains a dictionary with image ids as keys and detection | |
| # dicts as values. | |
| self._detection_cache[self._cur_world] = np.load(f).item() | |
| self._detection_table = self._detection_cache[self._cur_world] | |
| self._detection_area = {} | |
| self._update_graph() | |
| if self._cur_world not in self._vertex_to_pose: | |
      # Adding a fake pose for the super node of each target category.
| self._vertex_to_pose[self._cur_world] = { | |
| index: (-index,) for index in self._cur_graph.target_indexes.values() | |
| } | |
      # Calling vertex_to_pose for each vertex fills out the dictionaries
      # that contain pose related data.
| for image_id in self._world_id_dict[self._cur_world]: | |
| self.vertex_to_pose(self.to_vertex(image_id)) | |
| # Filling out pose_to_vertex from vertex_to_pose. | |
| self._pose_to_vertex[self._cur_world] = { | |
| tuple(v): k | |
| for k, v in self._vertex_to_pose[self._cur_world].iteritems() | |
| } | |
| cur_vertex = self._cur_graph.id_to_index[self._cur_image_id] | |
| observation = self.observation(self.vertex_to_pose(cur_vertex)) | |
| return observation | |
| def cur_vertex(self): | |
| return self._cur_graph.id_to_index[self._cur_image_id] | |
| def cur_image_id(self): | |
| return self._cur_image_id | |
| def path_to_goal(self, image_id=None): | |
| """Returns the path from image_id to the self._cur_goal. | |
| Args: | |
| image_id: If set to None, computes the path from the current view. | |
| Otherwise, sets the current view to the given image_id. | |
| Returns: | |
| The path to the goal. | |
| Raises: | |
| Exception if there's no path from the view to the goal. | |
| """ | |
| if image_id is None: | |
| image_id = self._cur_image_id | |
| super_source = self._cur_graph.target_indexes[self._cur_goal] | |
| try: | |
| path = nx.shortest_path(self._cur_graph.graph, | |
| self._cur_graph.id_to_index[image_id], | |
| super_source) | |
    except:
      print 'path not found, world = {}, image_id = {}'.format(
          self._cur_world, self._cur_image_id)
      raise
| return path[:-1] | |
| def targets(self): | |
| return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])] | |
| def vertex_to_pose(self, v): | |
| """Returns pose of the view for a given vertex. | |
| Args: | |
| v: integer, vertex index. | |
| Returns: | |
      (x, z, dir_x, dir_z) where x and z are the translation and dir_x, dir_z
      are a vector giving the direction of the view.
| """ | |
| if v in self._vertex_to_pose[self._cur_world]: | |
| return np.copy(self._vertex_to_pose[self._cur_world][v]) | |
| x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id( | |
| v)] | |
| if rot is None: # if rotation is not provided for the given vertex. | |
| self._vertex_to_pose[self._cur_world][v] = np.asarray( | |
| [x * scale, z * scale, v]) | |
| return np.copy(self._vertex_to_pose[self._cur_world][v]) | |
| # Multiply rotation matrix by [0,0,1] to get a vector of length 1 in the | |
| # direction of the ray. | |
| direction = np.zeros((3, 1), dtype=np.float32) | |
| direction[2][0] = 1 | |
| direction = np.matmul(np.transpose(rot), direction) | |
| direction = [direction[0][0], direction[2][0]] | |
| self._vertex_to_pose[self._cur_world][v] = np.asarray( | |
| [x * scale, z * scale, direction[0], direction[1]]) | |
| return np.copy(self._vertex_to_pose[self._cur_world][v]) | |
| def pose_to_vertex(self, pose): | |
| """Returns the vertex id for the given pose.""" | |
| if tuple(pose) not in self._pose_to_vertex[self._cur_world]: | |
| raise ValueError( | |
| 'The given pose is not present in the dictionary: {}'.format( | |
| tuple(pose))) | |
| return self._pose_to_vertex[self._cur_world][tuple(pose)] | |
| def check_scene_graph(self, world, goal): | |
| """Checks the connectivity of the scene graph. | |
| Goes over all the views. computes the shortest path to the goal. If it | |
| crashes it means that it's not connected. Otherwise, the env graph is fine. | |
| Args: | |
| world: the string name of the world. | |
| goal: the string label for the goal. | |
| Returns: | |
| Nothing. | |
| """ | |
| obs = self._reset_env(new_world=world, new_goal=goal) | |
    if not obs:
      print '{} is not available in {}'.format(goal, world)
| return True | |
| for image_id in self._world_id_dict[self._cur_world]: | |
| print 'check image_id = {}'.format(image_id) | |
| self._cur_image_id = image_id | |
| path = self.path_to_goal() | |
| actions = [] | |
| for i in range(len(path) - 2): | |
| actions.append(self.action(path[i], path[i + 1])) | |
| actions.append('stop') | |
| def goal_one_hot(self): | |
| res = np.zeros((len(self._targets),), dtype=np.float32) | |
| res[self._cur_goal_index] = 1. | |
| return res | |
| def goal_index(self): | |
| return self._cur_goal_index | |
| def goal_string(self): | |
| return self._cur_goal | |
| def worlds(self): | |
| return self._worlds | |
| def possible_targets(self): | |
| return self._targets | |
| def action(self, from_pose, to_pose): | |
| """Returns the action that takes source vertex to destination vertex. | |
| Args: | |
| from_pose: pose of the source. | |
| to_pose: pose of the destination. | |
| Returns: | |
      The index of the action.

    Raises:
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with a single action.
| """ | |
| from_index = self.pose_to_vertex(from_pose) | |
| to_index = self.pose_to_vertex(to_pose) | |
    if to_index not in self.graph()[from_index]:
| from_image_id = self.to_image_id(from_index) | |
| to_image_id = self.to_image_id(to_index) | |
| raise ValueError('{},{} is not connected to {},{}'.format( | |
| from_index, from_image_id, to_index, to_image_id)) | |
    return self._actions.index(self.graph()[from_index][to_index]['action'])
| def random_step_sequence(self, min_len=None, max_len=None): | |
| """Generates random step sequence that takes agent to the goal. | |
| Args: | |
      min_len: integer, minimum length of a step sequence. Not yet implemented.
      max_len: integer, the maximum number of observations and the maximum
        length of the returned path.
| Returns: | |
| Tuple of (path, actions, states, step_outputs). | |
| path: a random path from a random starting point and random environment. | |
| actions: actions of the returned path. | |
| states: viewpoints of all the states in between. | |
| step_outputs: list of step() return tuples. | |
| Raises: | |
      ValueError: if max_len is None or less than 1; if min_len is not None
        (not yet implemented).
| """ | |
| if max_len is None: | |
| raise ValueError('max_len can not be set as None') | |
| if max_len < 1: | |
      raise ValueError('max_len must be greater or equal to 1.')
| if min_len is not None: | |
| raise ValueError('min_len is not yet implemented.') | |
| path = [] | |
| actions = [] | |
| states = [] | |
| step_outputs = [] | |
| obs = self.reset() | |
| last_obs_tuple = [obs, 0, False, {}] | |
| for _ in xrange(max_len): | |
| action = np.random.choice(self._actions) | |
| # We don't want to sample stop action because stop does not add new | |
| # information. | |
| while action == 'stop': | |
| action = np.random.choice(self._actions) | |
| path.append(self.to_vertex(self._cur_image_id)) | |
| onehot = np.zeros((len(self._actions),), dtype=np.float32) | |
| onehot[self._actions.index(action)] = 1. | |
| actions.append(onehot) | |
| states.append(self.vertex_to_pose(path[-1])) | |
| step_outputs.append(copy.deepcopy(last_obs_tuple)) | |
| last_obs_tuple = self.step(action) | |
| return path, actions, states, step_outputs | |
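  # Usage sketch (assumes `env` was constructed with valid dataset paths):
  #
  #   path, actions, states, outputs = env.random_step_sequence(max_len=20)
  #   # `path` holds the visited vertex ids, `actions` the one-hot encoded
  #   # moves, and `states` the corresponding (x, z, dir_x, dir_z) poses.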