from typing import List, Dict, Tuple
from ditk import logging
from copy import deepcopy
from easydict import EasyDict
from torch.utils.data import Dataset
from dataclasses import dataclass
import pickle
import easydict
import torch
import numpy as np

from ding.utils.bfs_helper import get_vi_sequence
from ding.utils import DATASET_REGISTRY, import_module, DatasetNormalizer
from ding.rl_utils import discount_cumsum


@dataclass
class DatasetStatistics:
    """
    Overview:
        Dataset statistics.
    """
    mean: np.ndarray  # obs
    std: np.ndarray  # obs
    action_bounds: np.ndarray


class NaiveRLDataset(Dataset):
    """
    Overview:
        Naive RL dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, cfg) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`str` or :obj:`EasyDict`): Config dict, or the path of the offline dataset file.
        """
        assert type(cfg) in [str, EasyDict], "invalid cfg type: {}".format(type(cfg))
        if isinstance(cfg, EasyDict):
            self._data_path = cfg.policy.collect.data_path
        elif isinstance(cfg, str):
            self._data_path = cfg
        with open(self._data_path, 'rb') as f:
            self._data: List[Dict[str, torch.Tensor]] = pickle.load(f)

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """
        return len(self._data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        """
        return self._data[idx]
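

# A minimal usage sketch (an assumption, not part of the library API): ``NaiveRLDataset`` accepts
# either a config dict or a plain path to a pickled list of transition dicts (e.g. produced by
# ``naive_save`` below), so it can be fed directly into a standard PyTorch ``DataLoader``.
# The path and batch size are placeholders.
def _example_naive_rl_dataset(data_path: str = './expert_data.pkl'):
    from torch.utils.data import DataLoader
    dataset = NaiveRLDataset(data_path)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    # each batch is a dict of batched tensors, e.g. batch['obs'], batch['action']
    return next(iter(loader))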


class D4RLDataset(Dataset):
    """
    Overview:
        D4RL dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - mean (:obj:`np.ndarray`): Mean of the dataset.
        - std (:obj:`np.ndarray`): Std of the dataset.
        - action_bounds (:obj:`np.ndarray`): Action bounds of the dataset.
        - statistics (:obj:`DatasetStatistics`): Statistics of the dataset.
    """

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """
        import gym
        try:
            import d4rl  # register d4rl environments with OpenAI Gym
        except ImportError:
            import sys
            logging.warning("d4rl is not found, please install it, refer to https://github.com/rail-berkeley/d4rl")
            sys.exit(1)

        # Init parameters
        data_path = cfg.policy.collect.get('data_path', None)
        env_id = cfg.env.env_id

        # Create the environment
        if data_path:
            d4rl.set_dataset_path(data_path)
        env = gym.make(env_id)
        dataset = d4rl.qlearning_dataset(env)
        self._cal_statistics(dataset, env)
        try:
            if cfg.env.norm_obs.use_norm and cfg.env.norm_obs.offline_stats.use_offline_stats:
                dataset = self._normalize_states(dataset)
        except (KeyError, AttributeError):
            # do not normalize
            pass
        self._data = []
        self._load_d4rl(dataset)

    @property
    def data(self) -> List:
        return self._data

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """
        return len(self._data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        """
        return self._data[idx]

    def _load_d4rl(self, dataset: Dict[str, np.ndarray]) -> None:
        """
        Overview:
            Load the d4rl dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
        """
        for i in range(len(dataset['observations'])):
            trans_data = {}
            trans_data['obs'] = torch.from_numpy(dataset['observations'][i])
            trans_data['next_obs'] = torch.from_numpy(dataset['next_observations'][i])
            trans_data['action'] = torch.from_numpy(dataset['actions'][i])
            trans_data['reward'] = torch.tensor(dataset['rewards'][i])
            trans_data['done'] = dataset['terminals'][i]
            self._data.append(trans_data)

    def _cal_statistics(self, dataset, env, eps=1e-3, add_action_buffer=True):
        """
        Overview:
            Calculate the statistics of the dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
            - env (:obj:`gym.Env`): The environment.
            - eps (:obj:`float`): Epsilon.
            - add_action_buffer (:obj:`bool`): Whether to pad the action bounds with a small buffer.
        """
        self._mean = dataset['observations'].mean(0)
        self._std = dataset['observations'].std(0) + eps
        action_max = dataset['actions'].max(0)
        action_min = dataset['actions'].min(0)
        if add_action_buffer:
            action_buffer = 0.05 * (action_max - action_min)
            action_max = (action_max + action_buffer).clip(max=env.action_space.high)
            action_min = (action_min - action_buffer).clip(min=env.action_space.low)
        self._action_bounds = np.stack([action_min, action_max], axis=0)

    def _normalize_states(self, dataset):
        """
        Overview:
            Normalize the states.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The d4rl dataset.
        """
        dataset['observations'] = (dataset['observations'] - self._mean) / self._std
        dataset['next_observations'] = (dataset['next_observations'] - self._mean) / self._std
        return dataset

    @property
    def mean(self):
        """
        Overview:
            Get the mean of the dataset.
        """
        return self._mean

    @property
    def std(self):
        """
        Overview:
            Get the std of the dataset.
        """
        return self._std

    @property
    def action_bounds(self) -> np.ndarray:
        """
        Overview:
            Get the action bounds of the dataset.
        """
        return self._action_bounds

    @property
    def statistics(self) -> DatasetStatistics:
        """
        Overview:
            Get the statistics of the dataset.
        """
        return DatasetStatistics(mean=self.mean, std=self.std, action_bounds=self.action_bounds)
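

# A minimal usage sketch (the config layout below and the ``hopper-medium-v2`` dataset are
# placeholder assumptions, and d4rl must be installed): it only fills the config fields that
# ``D4RLDataset.__init__`` actually reads.
def _example_d4rl_dataset():
    cfg = EasyDict(
        dict(
            env=dict(env_id='hopper-medium-v2', norm_obs=dict(use_norm=False)),
            policy=dict(collect=dict(data_path=None)),
        )
    )
    dataset = D4RLDataset(cfg)
    stats = dataset.statistics  # DatasetStatistics(mean, std, action_bounds)
    return dataset[0], stats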


class HDF5Dataset(Dataset):
    """
    Overview:
        HDF5 dataset is saved in hdf5 format, which is used for offline RL algorithms.
        The hdf5 format is a common format for storing large numerical arrays in Python.
        For more details, please refer to https://support.hdfgroup.org/HDF5/.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - mean (:obj:`np.ndarray`): Mean of the dataset.
        - std (:obj:`np.ndarray`): Std of the dataset.
        - action_bounds (:obj:`np.ndarray`): Action bounds of the dataset.
        - statistics (:obj:`DatasetStatistics`): Statistics of the dataset.
    """

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """
        try:
            import h5py
        except ImportError:
            import sys
            logging.warning("h5py is not found, please install it through `pip install h5py`")
            sys.exit(1)
        data_path = cfg.policy.collect.get('data_path', None)
        if 'dataset' in cfg:
            self.context_len = cfg.dataset.context_len
        else:
            self.context_len = 0
        data = h5py.File(data_path, 'r')
        self._load_data(data)
        self._cal_statistics()
        try:
            if cfg.env.norm_obs.use_norm and cfg.env.norm_obs.offline_stats.use_offline_stats:
                self._normalize_states()
        except (KeyError, AttributeError):
            # do not normalize
            pass

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """
        return len(self._data['obs']) - self.context_len

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """
        if self.context_len == 0:  # for other offline RL algorithms
            return {k: self._data[k][idx] for k in self._data.keys()}
        else:  # for decision transformer
            block_size = self.context_len
            done_idx = idx + block_size
            idx = done_idx - block_size
            states = torch.as_tensor(
                np.array(self._data['obs'][idx:done_idx]), dtype=torch.float32
            ).view(block_size, -1)
            actions = torch.as_tensor(self._data['action'][idx:done_idx], dtype=torch.long)
            rtgs = torch.as_tensor(self._data['reward'][idx:done_idx, 0], dtype=torch.float32)
            timesteps = torch.as_tensor(range(idx, done_idx), dtype=torch.int64)
            traj_mask = torch.ones(self.context_len, dtype=torch.long)
            return timesteps, states, actions, rtgs, traj_mask

    def _load_data(self, dataset: Dict[str, np.ndarray]) -> None:
        """
        Overview:
            Load the dataset.
        Arguments:
            - dataset (:obj:`Dict[str, np.ndarray]`): The dataset.
        """
        self._data = {}
        for k in dataset.keys():
            logging.info(f'Load {k} data.')
            self._data[k] = dataset[k][:]

    def _cal_statistics(self, eps: float = 1e-3):
        """
        Overview:
            Calculate the statistics of the dataset.
        Arguments:
            - eps (:obj:`float`): Epsilon.
        """
        self._mean = self._data['obs'].mean(0)
        self._std = self._data['obs'].std(0) + eps
        action_max = self._data['action'].max(0)
        action_min = self._data['action'].min(0)
        buffer = 0.05 * (action_max - action_min)
        action_max = action_max.astype(float) + buffer
        action_min = action_min.astype(float) - buffer
        self._action_bounds = np.stack([action_min, action_max], axis=0)

    def _normalize_states(self):
        """
        Overview:
            Normalize the states.
        """
        self._data['obs'] = (self._data['obs'] - self._mean) / self._std
        self._data['next_obs'] = (self._data['next_obs'] - self._mean) / self._std

    @property
    def mean(self):
        """
        Overview:
            Get the mean of the dataset.
        """
        return self._mean

    @property
    def std(self):
        """
        Overview:
            Get the std of the dataset.
        """
        return self._std

    @property
    def action_bounds(self) -> np.ndarray:
        """
        Overview:
            Get the action bounds of the dataset.
        """
        return self._action_bounds

    @property
    def statistics(self) -> DatasetStatistics:
        """
        Overview:
            Get the statistics of the dataset.
        """
        return DatasetStatistics(mean=self.mean, std=self.std, action_bounds=self.action_bounds)
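

# A minimal round-trip sketch (hypothetical paths and config layout): ``hdf5_save`` below writes
# ``<prefix>_demos.hdf5``, which ``HDF5Dataset`` can then read back via ``policy.collect.data_path``.
def _example_hdf5_dataset(data_path: str = './expert_demos.hdf5'):
    cfg = EasyDict(dict(policy=dict(collect=dict(data_path=data_path)), env=dict()))
    dataset = HDF5Dataset(cfg)
    # context_len defaults to 0 here, so each item is a transition dict
    return dataset[0]['obs'], dataset.statistics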


class D4RLTrajectoryDataset(Dataset):
    """
    Overview:
        D4RL trajectory dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    # from infos.py from official d4rl github repo
    REF_MIN_SCORE = {
        'halfcheetah': -280.178953,
        'walker2d': 1.629008,
        'hopper': -20.272305,
    }

    REF_MAX_SCORE = {
        'halfcheetah': 12135.0,
        'walker2d': 4592.3,
        'hopper': 3234.3,
    }

    # calculated from d4rl datasets
    D4RL_DATASET_STATS = {
        'halfcheetah-medium-v2': {
            'state_mean': [
                -0.06845773756504059, 0.016414547339081764, -0.18354906141757965, -0.2762460708618164,
                -0.34061527252197266, -0.09339715540409088, -0.21321271359920502, -0.0877423882484436,
                5.173007488250732, -0.04275195300579071, -0.036108363419771194, 0.14053793251514435,
                0.060498327016830444, 0.09550975263118744, 0.06739100068807602, 0.005627387668937445,
                0.013382787816226482
            ],
            'state_std': [
                0.07472999393939972, 0.3023499846458435, 0.30207309126853943, 0.34417077898979187, 0.17619241774082184,
                0.507205605506897, 0.2567007839679718, 0.3294812738895416, 1.2574149370193481, 0.7600541710853577,
                1.9800915718078613, 6.565362453460693, 7.466367721557617, 4.472222805023193, 10.566964149475098,
                5.671932697296143, 7.4982590675354
            ]
        },
        'halfcheetah-medium-replay-v2': {
            'state_mean': [
                -0.12880703806877136, 0.3738119602203369, -0.14995987713336945, -0.23479078710079193,
                -0.2841278612613678, -0.13096535205841064, -0.20157982409000397, -0.06517726927995682,
                3.4768247604370117, -0.02785065770149231, -0.015035249292850494, 0.07697279006242752,
                0.01266712136566639, 0.027325302362442017, 0.02316424623131752, 0.010438721626996994,
                -0.015839405357837677
            ],
            'state_std': [
                0.17019015550613403, 1.284424901008606, 0.33442774415016174, 0.3672759234905243, 0.26092398166656494,
                0.4784106910228729, 0.3181420564651489, 0.33552637696266174, 2.0931615829467773, 0.8037433624267578,
                1.9044333696365356, 6.573209762573242, 7.572863578796387, 5.069749355316162, 9.10555362701416,
                6.085654258728027, 7.25300407409668
            ]
        },
        'halfcheetah-medium-expert-v2': {
            'state_mean': [
                -0.05667462572455406, 0.024369969964027405, -0.061670560389757156, -0.22351515293121338,
                -0.2675151228904724, -0.07545716315507889, -0.05809682980179787, -0.027675075456500053,
                8.110626220703125, -0.06136331334710121, -0.17986927926540375, 0.25175222754478455, 0.24186332523822784,
                0.2519369423389435, 0.5879552960395813, -0.24090635776519775, -0.030184272676706314
            ],
            'state_std': [
                0.06103534251451492, 0.36054104566574097, 0.45544400811195374, 0.38476887345314026, 0.2218363732099533,
                0.5667523741722107, 0.3196682929992676, 0.2852923572063446, 3.443821907043457, 0.6728139519691467,
                1.8616976737976074, 9.575807571411133, 10.029894828796387, 5.903450012207031, 12.128185272216797,
                6.4811787605285645, 6.378620147705078
            ]
        },
        'walker2d-medium-v2': {
            'state_mean': [
                1.218966007232666, 0.14163373410701752, -0.03704913705587387, -0.13814310729503632, 0.5138224363327026,
                -0.04719110205769539, -0.47288352251052856, 0.042254164814949036, 2.3948874473571777,
                -0.03143199160695076, 0.04466355964541435, -0.023907244205474854, -0.1013401448726654,
                0.09090937674045563, -0.004192637279629707, -0.12120571732521057, -0.5497063994407654
            ],
            'state_std': [
                0.12311358004808426, 0.3241879940032959, 0.11456084251403809, 0.2623065710067749, 0.5640279054641724,
                0.2271878570318222, 0.3837319612503052, 0.7373676896095276, 1.2387926578521729, 0.798020601272583,
                1.5664079189300537, 1.8092705011367798, 3.025604248046875, 4.062486171722412, 1.4586567878723145,
                3.7445690631866455, 5.5851287841796875
            ]
        },
        'walker2d-medium-replay-v2': {
            'state_mean': [
                1.209364652633667, 0.13264022767543793, -0.14371201395988464, -0.2046516090631485, 0.5577612519264221,
                -0.03231537342071533, -0.2784661054611206, 0.19130706787109375, 1.4701707363128662,
                -0.12504704296588898, 0.0564953051507473, -0.09991033375263214, -0.340340256690979, 0.03546293452382088,
                -0.08934258669614792, -0.2992438077926636, -0.5984178185462952
            ],
            'state_std': [
                0.11929835379123688, 0.3562574088573456, 0.25852200388908386, 0.42075422406196594, 0.5202291011810303,
                0.15685082972049713, 0.36770978569984436, 0.7161387801170349, 1.3763766288757324, 0.8632221817970276,
                2.6364643573760986, 3.0134117603302, 3.720684051513672, 4.867283821105957, 2.6681625843048096,
                3.845186948776245, 5.4768385887146
            ]
        },
        'walker2d-medium-expert-v2': {
            'state_mean': [
                1.2294334173202515, 0.16869689524173737, -0.07089081406593323, -0.16197483241558075,
                0.37101927399635315, -0.012209027074277401, -0.42461398243904114, 0.18986578285694122,
                3.162475109100342, -0.018092676997184753, 0.03496946766972542, -0.013921679928898811,
                -0.05937029421329498, -0.19549426436424255, -0.0019200450042262673, -0.062483321875333786,
                -0.27366524934768677
            ],
            'state_std': [
                0.09932824969291687, 0.25981399416923523, 0.15062759816646576, 0.24249176681041718, 0.6758718490600586,
                0.1650741547346115, 0.38140663504600525, 0.6962361335754395, 1.3501490354537964, 0.7641991376876831,
                1.534574270248413, 2.1785972118377686, 3.276582717895508, 4.766193866729736, 1.1716983318328857,
                4.039782524108887, 5.891613960266113
            ]
        },
        'hopper-medium-v2': {
            'state_mean': [
                1.311279058456421, -0.08469521254301071, -0.5382719039916992, -0.07201576232910156, 0.04932365566492081,
                2.1066856384277344, -0.15017354488372803, 0.008783451281487942, -0.2848185896873474,
                -0.18540096282958984, -0.28461286425590515
            ],
            'state_std': [
                0.17790751159191132, 0.05444620922207832, 0.21297138929367065, 0.14530418813228607, 0.6124444007873535,
                0.8517446517944336, 1.4515252113342285, 0.6751695871353149, 1.5362390279769897, 1.616074562072754,
                5.607253551483154
            ]
        },
        'hopper-medium-replay-v2': {
            'state_mean': [
                1.2305138111114502, -0.04371410980820656, -0.44542956352233887, -0.09370097517967224,
                0.09094487875699997, 1.3694725036621094, -0.19992674887180328, -0.022861352190375328,
                -0.5287045240402222, -0.14465883374214172, -0.19652697443962097
            ],
            'state_std': [
                0.1756512075662613, 0.0636928603053093, 0.3438323438167572, 0.19566889107227325, 0.5547984838485718,
                1.051029920578003, 1.158307671546936, 0.7963128685951233, 1.4802359342575073, 1.6540331840515137,
                5.108601093292236
            ]
        },
        'hopper-medium-expert-v2': {
            'state_mean': [
                1.3293815851211548, -0.09836531430482864, -0.5444297790527344, -0.10201650857925415,
                0.02277466468513012, 2.3577215671539307, -0.06349576264619827, -0.00374026270583272,
                -0.1766270101070404, -0.11862941086292267, -0.12097819894552231
            ],
            'state_std': [
                0.17012375593185425, 0.05159067362546921, 0.18141433596611023, 0.16430604457855225, 0.6023368239402771,
                0.7737284898757935, 1.4986555576324463, 0.7483318448066711, 1.7953159809112549, 2.0530025959014893,
                5.725032806396484
            ]
        },
    }

    def __init__(self, cfg: dict) -> None:
        """
        Overview:
            Initialization method.
        Arguments:
            - cfg (:obj:`dict`): Config dict.
        """
        dataset_path = cfg.dataset.data_dir_prefix
        rtg_scale = cfg.dataset.rtg_scale
        self.context_len = cfg.dataset.context_len
        self.env_type = cfg.dataset.env_type

        if 'hdf5' in dataset_path:  # for mujoco env
            try:
                import h5py
                import collections
            except ImportError:
                import sys
                logging.warning("h5py is not found, please install it through `pip install h5py`")
                sys.exit(1)
            dataset = h5py.File(dataset_path, 'r')

            N = dataset['rewards'].shape[0]
            data_ = collections.defaultdict(list)

            use_timeouts = False
            if 'timeouts' in dataset:
                use_timeouts = True

            episode_step = 0
            paths = []
            for i in range(N):
                done_bool = bool(dataset['terminals'][i])
                if use_timeouts:
                    final_timestep = dataset['timeouts'][i]
                else:
                    final_timestep = (episode_step == 1000 - 1)
                for k in ['observations', 'actions', 'rewards', 'terminals']:
                    data_[k].append(dataset[k][i])
                if done_bool or final_timestep:
                    episode_step = 0
                    episode_data = {}
                    for k in data_:
                        episode_data[k] = np.array(data_[k])
                    paths.append(episode_data)
                    data_ = collections.defaultdict(list)
                episode_step += 1

            self.trajectories = paths

            # calculate state mean and variance and returns_to_go for all traj
            states = []
            for traj in self.trajectories:
                traj_len = traj['observations'].shape[0]
                states.append(traj['observations'])
                # calculate returns to go and rescale them
                traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

            # used for input normalization
            states = np.concatenate(states, axis=0)
            self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

            # normalize states
            for traj in self.trajectories:
                traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std
        elif 'pkl' in dataset_path:
            if 'dqn' in dataset_path:
                # load dataset
                with open(dataset_path, 'rb') as f:
                    self.trajectories = pickle.load(f)

                if isinstance(self.trajectories[0], list):
                    # for our collected dataset, e.g. cartpole/lunarlander case
                    trajectories_tmp = []

                    original_keys = ['obs', 'next_obs', 'action', 'reward']
                    keys = ['observations', 'next_observations', 'actions', 'rewards']
                    trajectories_tmp = [
                        {
                            key: np.stack(
                                [
                                    self.trajectories[eps_index][transition_index][o_key]
                                    for transition_index in range(len(self.trajectories[eps_index]))
                                ],
                                axis=0
                            )
                            for key, o_key in zip(keys, original_keys)
                        } for eps_index in range(len(self.trajectories))
                    ]
                    self.trajectories = trajectories_tmp

                states = []
                for traj in self.trajectories:
                    # traj_len = traj['observations'].shape[0]
                    states.append(traj['observations'])
                    # calculate returns to go and rescale them
                    traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

                # used for input normalization
                states = np.concatenate(states, axis=0)
                self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

                # normalize states
                for traj in self.trajectories:
                    traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std
            else:
                # load dataset
                with open(dataset_path, 'rb') as f:
                    self.trajectories = pickle.load(f)

                states = []
                for traj in self.trajectories:
                    states.append(traj['observations'])
                    # calculate returns to go and rescale them
                    traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

                # used for input normalization
                states = np.concatenate(states, axis=0)
                self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

                # normalize states
                for traj in self.trajectories:
                    traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std
        else:
            # -- load data from memory (make more efficient)
            obss = []
            actions = []
            returns = [0]
            done_idxs = []
            stepwise_returns = []

            transitions_per_buffer = np.zeros(50, dtype=int)
            num_trajectories = 0
            while len(obss) < cfg.dataset.num_steps:
                buffer_num = np.random.choice(np.arange(50 - cfg.dataset.num_buffers, 50), 1)[0]
                i = transitions_per_buffer[buffer_num]
                frb = FixedReplayBuffer(
                    data_dir=cfg.dataset.data_dir_prefix + '/1/replay_logs',
                    replay_suffix=buffer_num,
                    observation_shape=(84, 84),
                    stack_size=4,
                    update_horizon=1,
                    gamma=0.99,
                    observation_dtype=np.uint8,
                    batch_size=32,
                    replay_capacity=100000
                )
                if frb._loaded_buffers:
                    done = False
                    curr_num_transitions = len(obss)
                    trajectories_to_load = cfg.dataset.trajectories_per_buffer
                    while not done:
                        states, ac, ret, next_states, next_action, next_reward, terminal, indices = \
                            frb.sample_transition_batch(batch_size=1, indices=[i])
                        states = states.transpose((0, 3, 1, 2))[0]  # (1, 84, 84, 4) --> (4, 84, 84)
                        obss.append(states)
                        actions.append(ac[0])
                        stepwise_returns.append(ret[0])
                        if terminal[0]:
                            done_idxs.append(len(obss))
                            returns.append(0)
                            if trajectories_to_load == 0:
                                done = True
                            else:
                                trajectories_to_load -= 1
                        returns[-1] += ret[0]
                        i += 1
                        if i >= 100000:
                            obss = obss[:curr_num_transitions]
                            actions = actions[:curr_num_transitions]
                            stepwise_returns = stepwise_returns[:curr_num_transitions]
                            returns[-1] = 0
                            i = transitions_per_buffer[buffer_num]
                            done = True
                    num_trajectories += (cfg.dataset.trajectories_per_buffer - trajectories_to_load)
                    transitions_per_buffer[buffer_num] = i

            actions = np.array(actions)
            returns = np.array(returns)
            stepwise_returns = np.array(stepwise_returns)
            done_idxs = np.array(done_idxs)

            # -- create reward-to-go dataset
            start_index = 0
            rtg = np.zeros_like(stepwise_returns)
            for i in done_idxs:
                i = int(i)
                curr_traj_returns = stepwise_returns[start_index:i]
                for j in range(i - 1, start_index - 1, -1):  # start from i-1
                    rtg_j = curr_traj_returns[j - start_index:i - start_index]
                    rtg[j] = sum(rtg_j)
                start_index = i

            # -- create timestep dataset
            start_index = 0
            timesteps = np.zeros(len(actions) + 1, dtype=int)
            for i in done_idxs:
                i = int(i)
                timesteps[start_index:i + 1] = np.arange(i + 1 - start_index)
                start_index = i + 1

            self.obss = obss
            self.actions = actions
            self.done_idxs = done_idxs
            self.rtgs = rtg
            self.timesteps = timesteps
            # return obss, actions, returns, done_idxs, rtg, timesteps

    def get_max_timestep(self) -> int:
        """
        Overview:
            Get the max timestep of the dataset.
        """
        return max(self.timesteps)

    def get_state_stats(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Overview:
            Get the state mean and std of the dataset.
        """
        return deepcopy(self.state_mean), deepcopy(self.state_std)

    def get_d4rl_dataset_stats(self, env_d4rl_name: str) -> Dict[str, list]:
        """
        Overview:
            Get the d4rl dataset stats.
        Arguments:
            - env_d4rl_name (:obj:`str`): The d4rl env name.
        """
        return self.D4RL_DATASET_STATS[env_d4rl_name]

    def __len__(self) -> int:
        """
        Overview:
            Get the length of the dataset.
        """
        if self.env_type != 'atari':
            return len(self.trajectories)
        else:
            return len(self.obss) - self.context_len

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """
        if self.env_type != 'atari':
            traj = self.trajectories[idx]
            traj_len = traj['observations'].shape[0]

            if traj_len > self.context_len:
                # sample random index to slice trajectory
                si = np.random.randint(0, traj_len - self.context_len)

                states = torch.from_numpy(traj['observations'][si:si + self.context_len])
                actions = torch.from_numpy(traj['actions'][si:si + self.context_len])
                returns_to_go = torch.from_numpy(traj['returns_to_go'][si:si + self.context_len])
                timesteps = torch.arange(start=si, end=si + self.context_len, step=1)

                # all ones since no padding
                traj_mask = torch.ones(self.context_len, dtype=torch.long)
            else:
                padding_len = self.context_len - traj_len

                # padding with zeros
                states = torch.from_numpy(traj['observations'])
                states = torch.cat(
                    [states, torch.zeros(([padding_len] + list(states.shape[1:])), dtype=states.dtype)], dim=0
                )

                actions = torch.from_numpy(traj['actions'])
                actions = torch.cat(
                    [actions, torch.zeros(([padding_len] + list(actions.shape[1:])), dtype=actions.dtype)], dim=0
                )

                returns_to_go = torch.from_numpy(traj['returns_to_go'])
                returns_to_go = torch.cat(
                    [
                        returns_to_go,
                        torch.zeros(([padding_len] + list(returns_to_go.shape[1:])), dtype=returns_to_go.dtype)
                    ],
                    dim=0
                )

                timesteps = torch.arange(start=0, end=self.context_len, step=1)

                traj_mask = torch.cat(
                    [torch.ones(traj_len, dtype=torch.long),
                     torch.zeros(padding_len, dtype=torch.long)], dim=0
                )
            return timesteps, states, actions, returns_to_go, traj_mask
        else:  # mean cost less than 0.001s
            block_size = self.context_len
            done_idx = idx + block_size
            for i in self.done_idxs:
                if i > idx:  # first done_idx greater than idx
                    done_idx = min(int(i), done_idx)
                    break
            idx = done_idx - block_size
            states = torch.as_tensor(
                np.array(self.obss[idx:done_idx]), dtype=torch.float32
            ).view(block_size, -1)  # (block_size, 4*84*84)
            states = states / 255.
            actions = torch.as_tensor(self.actions[idx:done_idx], dtype=torch.long).unsqueeze(1)  # (block_size, 1)
            rtgs = torch.as_tensor(self.rtgs[idx:done_idx], dtype=torch.float32).unsqueeze(1)
            timesteps = torch.as_tensor(self.timesteps[idx:idx + 1], dtype=torch.int64).unsqueeze(1)
            traj_mask = torch.ones(self.context_len, dtype=torch.long)
            return timesteps, states, actions, rtgs, traj_mask
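

# A minimal usage sketch for the Decision Transformer (non-atari, pickle) branch; all config values
# and the pickle path are placeholder assumptions. Each item is a
# ``(timesteps, states, actions, returns_to_go, traj_mask)`` tuple of length ``context_len``.
def _example_d4rl_trajectory_dataset(pkl_path: str = './cartpole_episodes.pkl'):
    cfg = EasyDict(
        dict(dataset=dict(data_dir_prefix=pkl_path, rtg_scale=1000, context_len=20, env_type='classic'))
    )
    dataset = D4RLTrajectoryDataset(cfg)
    timesteps, states, actions, rtg, mask = dataset[0]
    return states.shape, mask.sum()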


class D4RLDiffuserDataset(Dataset):
    """
    Overview:
        D4RL diffuser dataset, which is used for offline RL algorithms.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, dataset_path: str, context_len: int, rtg_scale: float) -> None:
        """
        Overview:
            Initialization method of D4RLDiffuserDataset.
        Arguments:
            - dataset_path (:obj:`str`): The dataset path.
            - context_len (:obj:`int`): The length of the context.
            - rtg_scale (:obj:`float`): The scale of the returns to go.
        """
        self.context_len = context_len

        # load dataset
        with open(dataset_path, 'rb') as f:
            self.trajectories = pickle.load(f)

        if isinstance(self.trajectories[0], list):
            # for our collected dataset, e.g. cartpole/lunarlander case
            trajectories_tmp = []

            original_keys = ['obs', 'next_obs', 'action', 'reward']
            keys = ['observations', 'next_observations', 'actions', 'rewards']
            # stack every transition field of each episode into one array per key
            trajectories_tmp = [
                {
                    key: np.stack(
                        [
                            self.trajectories[eps_index][transition_index][o_key]
                            for transition_index in range(len(self.trajectories[eps_index]))
                        ],
                        axis=0
                    )
                    for key, o_key in zip(keys, original_keys)
                } for eps_index in range(len(self.trajectories))
            ]
            self.trajectories = trajectories_tmp

        states = []
        for traj in self.trajectories:
            traj_len = traj['observations'].shape[0]
            states.append(traj['observations'])
            # calculate returns to go and rescale them
            traj['returns_to_go'] = discount_cumsum(traj['rewards'], 1.0) / rtg_scale

        # used for input normalization
        states = np.concatenate(states, axis=0)
        self.state_mean, self.state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6

        # normalize states
        for traj in self.trajectories:
            traj['observations'] = (traj['observations'] - self.state_mean) / self.state_std


class FixedReplayBuffer(object):
    """
    Overview:
        Object composed of a list of OutOfGraphReplayBuffers.
    Interfaces:
        ``__init__``, ``get_transition_elements``, ``sample_transition_batch``
    """

    def __init__(self, data_dir, replay_suffix, *args, **kwargs):  # pylint: disable=keyword-arg-before-vararg
        """
        Overview:
            Initialize the FixedReplayBuffer class.
        Arguments:
            - data_dir (:obj:`str`): Log directory from which to load the replay buffer.
            - replay_suffix (:obj:`int`): If not None, then only load the replay buffer \
                corresponding to the specific suffix in data directory.
            - args (:obj:`list`): Arbitrary extra arguments.
            - kwargs (:obj:`dict`): Arbitrary keyword arguments.
        """
        self._args = args
        self._kwargs = kwargs
        self._data_dir = data_dir
        self._loaded_buffers = False
        self.add_count = np.array(0)
        self._replay_suffix = replay_suffix
        if not self._loaded_buffers:
            if replay_suffix is not None:
                assert replay_suffix >= 0, 'Please pass a non-negative replay suffix'
                self.load_single_buffer(replay_suffix)
            else:
                pass
                # self._load_replay_buffers(num_buffers=50)

    def load_single_buffer(self, suffix):
        """
        Overview:
            Load a single replay buffer.
        Arguments:
            - suffix (:obj:`int`): The suffix of the replay buffer.
        """
        replay_buffer = self._load_buffer(suffix)
        if replay_buffer is not None:
            self._replay_buffers = [replay_buffer]
            self.add_count = replay_buffer.add_count
            self._num_replay_buffers = 1
            self._loaded_buffers = True

    def _load_buffer(self, suffix):
        """
        Overview:
            Loads an OutOfGraphReplayBuffer replay buffer.
        Arguments:
            - suffix (:obj:`int`): The suffix of the replay buffer.
        """
        try:
            from dopamine.replay_memory import circular_replay_buffer
            STORE_FILENAME_PREFIX = circular_replay_buffer.STORE_FILENAME_PREFIX
            # pytype: disable=attribute-error
            replay_buffer = circular_replay_buffer.OutOfGraphReplayBuffer(*self._args, **self._kwargs)
            replay_buffer.load(self._data_dir, suffix)
            # pytype: enable=attribute-error
            return replay_buffer
        # except tf.errors.NotFoundError:
        except Exception:
            raise RuntimeError('can not load replay buffer from {}'.format(self._data_dir))

    def get_transition_elements(self):
        """
        Overview:
            Returns the transition elements.
        """
        return self._replay_buffers[0].get_transition_elements()

    def sample_transition_batch(self, batch_size=None, indices=None):
        """
        Overview:
            Returns a batch of transitions (including any extra contents).
        Arguments:
            - batch_size (:obj:`int`): The batch size.
            - indices (:obj:`list`): The indices of the batch.
        """
        buffer_index = np.random.randint(self._num_replay_buffers)
        return self._replay_buffers[buffer_index].sample_transition_batch(batch_size=batch_size, indices=indices)


class PCDataset(Dataset):
    """
    Overview:
        Dataset for Procedure Cloning.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, all_data):
        """
        Overview:
            Initialization method of PCDataset.
        Arguments:
            - all_data (:obj:`tuple`): The tuple of all data.
        """
        self._data = all_data

    def __getitem__(self, item):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - item (:obj:`int`): The index of the dataset.
        """
        return {'obs': self._data[0][item], 'bfs_in': self._data[1][item], 'bfs_out': self._data[2][item]}

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """
        return self._data[0].shape[0]


def load_bfs_datasets(train_seeds=1, test_seeds=5):
    """
    Overview:
        Load BFS datasets.
    Arguments:
        - train_seeds (:obj:`int`): The number of train seeds.
        - test_seeds (:obj:`int`): The number of test seeds.
    """
    from dizoo.maze.envs import Maze

    def load_env(seed):
        ccc = easydict.EasyDict({'size': 16})
        e = Maze(ccc)
        e.seed(seed)
        e.reset()
        return e

    envs = [load_env(i) for i in range(train_seeds + test_seeds)]

    observations_train = []
    observations_test = []
    bfs_input_maps_train = []
    bfs_input_maps_test = []
    bfs_output_maps_train = []
    bfs_output_maps_test = []
    for idx, env in enumerate(envs):
        if idx < train_seeds:
            observations = observations_train
            bfs_input_maps = bfs_input_maps_train
            bfs_output_maps = bfs_output_maps_train
        else:
            observations = observations_test
            bfs_input_maps = bfs_input_maps_test
            bfs_output_maps = bfs_output_maps_test

        start_obs = env.process_states(env._get_obs(), env.get_maze_map())
        _, track_back = get_vi_sequence(env, start_obs)
        env_observations = torch.stack([track_back[i][0] for i in range(len(track_back))], dim=0)

        for i in range(env_observations.shape[0]):
            bfs_sequence, _ = get_vi_sequence(env, env_observations[i].numpy().astype(np.int32))  # [L, W, W]
            bfs_input_map = env.n_action * np.ones([env.size, env.size], dtype=np.int64)

            for j in range(bfs_sequence.shape[0]):
                bfs_input_maps.append(torch.from_numpy(bfs_input_map))
                bfs_output_maps.append(torch.from_numpy(bfs_sequence[j]))
                observations.append(env_observations[i])
                bfs_input_map = bfs_sequence[j]

    train_data = PCDataset(
        (
            torch.stack(observations_train, dim=0),
            torch.stack(bfs_input_maps_train, dim=0),
            torch.stack(bfs_output_maps_train, dim=0),
        )
    )
    test_data = PCDataset(
        (
            torch.stack(observations_test, dim=0),
            torch.stack(bfs_input_maps_test, dim=0),
            torch.stack(bfs_output_maps_test, dim=0),
        )
    )

    return train_data, test_data
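

# A minimal usage sketch (assumes the ``dizoo`` Maze environment is available): the returned
# ``PCDataset`` items are dicts with ``obs``, ``bfs_in`` and ``bfs_out`` tensors for procedure cloning.
def _example_load_bfs_datasets():
    train_data, test_data = load_bfs_datasets(train_seeds=1, test_seeds=1)
    sample = train_data[0]
    return sample['obs'].shape, sample['bfs_in'].shape, sample['bfs_out'].shape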


class BCODataset(Dataset):
    """
    Overview:
        Dataset for Behavioral Cloning from Observation.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    Properties:
        - obs (:obj:`np.ndarray`): The observation array.
        - action (:obj:`np.ndarray`): The action array.
    """

    def __init__(self, data=None):
        """
        Overview:
            Initialization method of BCODataset.
        Arguments:
            - data (:obj:`dict`): The data dict.
        """
        if data is None:
            raise ValueError('Dataset can not be empty!')
        else:
            self._data = data

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """
        return len(self._data['obs'])

    def __getitem__(self, idx):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
        """
        return {k: self._data[k][idx] for k in self._data.keys()}

    @property
    def obs(self):
        """
        Overview:
            Get the observation array.
        """
        return self._data['obs']

    @property
    def action(self):
        """
        Overview:
            Get the action array.
        """
        return self._data['action']
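

# A minimal usage sketch (the random arrays below are placeholder data): ``BCODataset`` simply
# wraps a dict of equally sized arrays and exposes ``obs``/``action`` as properties.
def _example_bco_dataset():
    data = {
        'obs': np.random.randn(128, 4).astype(np.float32),
        'action': np.random.randint(0, 2, size=(128, )),
    }
    dataset = BCODataset(data)
    return len(dataset), dataset[0], dataset.obs.shape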


class SequenceDataset(torch.utils.data.Dataset):
    """
    Overview:
        Dataset for diffuser.
    Interfaces:
        ``__init__``, ``__len__``, ``__getitem__``
    """

    def __init__(self, cfg):
        """
        Overview:
            Initialization method of SequenceDataset.
        Arguments:
            - cfg (:obj:`dict`): The config dict.
        """
        import gym

        env_id = cfg.env.env_id
        data_path = cfg.policy.collect.get('data_path', None)
        env = gym.make(env_id)
        dataset = env.get_dataset()

        self.returns_scale = cfg.env.returns_scale
        self.horizon = cfg.env.horizon
        self.max_path_length = cfg.env.max_path_length
        self.discount = cfg.policy.learn.discount_factor
        self.discounts = self.discount ** np.arange(self.max_path_length)[:, None]
        self.use_padding = cfg.env.use_padding
        self.include_returns = cfg.env.include_returns
        self.env_id = cfg.env.env_id
        itr = self.sequence_dataset(env, dataset)

        self.n_episodes = 0
        fields = {}
        for k in dataset.keys():
            if 'metadata' in k:
                continue
            fields[k] = []
        fields['path_lengths'] = []

        for i, episode in enumerate(itr):
            path_length = len(episode['observations'])
            assert path_length <= self.max_path_length
            fields['path_lengths'].append(path_length)
            for key, val in episode.items():
                if key not in fields:
                    fields[key] = []
                if val.ndim < 2:
                    val = np.expand_dims(val, axis=-1)
                shape = (self.max_path_length, val.shape[-1])
                arr = np.zeros(shape, dtype=np.float32)
                arr[:path_length] = val
                fields[key].append(arr)
            if episode['terminals'].any() and cfg.env.termination_penalty and 'timeouts' in episode:
                assert not episode['timeouts'].any(), 'Penalized a timeout episode for early termination'
                fields['rewards'][-1][path_length - 1] += cfg.env.termination_penalty
            self.n_episodes += 1

        for k in fields.keys():
            fields[k] = np.array(fields[k])

        self.normalizer = DatasetNormalizer(fields, cfg.policy.normalizer, path_lengths=fields['path_lengths'])
        self.indices = self.make_indices(fields['path_lengths'], self.horizon)

        self.observation_dim = cfg.env.obs_dim
        self.action_dim = cfg.env.action_dim
        self.fields = fields
        self.normalize()
        self.normed = False
        if cfg.env.normed:
            self.vmin, self.vmax = self._get_bounds()
            self.normed = True

        # shapes = {key: val.shape for key, val in self.fields.items()}
        # print(f'[ datasets/mujoco ] Dataset fields: {shapes}')

    def sequence_dataset(self, env, dataset=None):
        """
        Overview:
            Iterate over the dataset and yield one episode dict at a time.
        Arguments:
            - env (:obj:`gym.Env`): The gym env.
            - dataset (:obj:`dict`): The dataset dict.
        """
        import collections
        N = dataset['rewards'].shape[0]
        if 'maze2d' in env.spec.id:
            dataset = self.maze2d_set_terminals(env, dataset)
        data_ = collections.defaultdict(list)

        # The newer version of the dataset adds an explicit
        # timeouts field. Keep old method for backwards compatibility.
        use_timeouts = 'timeouts' in dataset

        episode_step = 0
        for i in range(N):
            done_bool = bool(dataset['terminals'][i])
            if use_timeouts:
                final_timestep = dataset['timeouts'][i]
            else:
                final_timestep = (episode_step == env._max_episode_steps - 1)

            for k in dataset:
                if 'metadata' in k:
                    continue
                data_[k].append(dataset[k][i])

            if done_bool or final_timestep:
                episode_step = 0
                episode_data = {}
                for k in data_:
                    episode_data[k] = np.array(data_[k])
                if 'maze2d' in env.spec.id:
                    episode_data = self.process_maze2d_episode(episode_data)
                yield episode_data
                data_ = collections.defaultdict(list)

            episode_step += 1

    def maze2d_set_terminals(self, env, dataset):
        """
        Overview:
            Set the terminals for maze2d.
        Arguments:
            - env (:obj:`gym.Env`): The gym env.
            - dataset (:obj:`dict`): The dataset dict.
        """
        goal = env.get_target()
        threshold = 0.5

        xy = dataset['observations'][:, :2]
        distances = np.linalg.norm(xy - goal, axis=-1)
        at_goal = distances < threshold
        timeouts = np.zeros_like(dataset['timeouts'])

        # timeout at time t iff
        #      at goal at time t and
        #      not at goal at time t + 1
        timeouts[:-1] = at_goal[:-1] * ~at_goal[1:]

        timeout_steps = np.where(timeouts)[0]
        path_lengths = timeout_steps[1:] - timeout_steps[:-1]

        print(
            f'[ utils/preprocessing ] Segmented {env.spec.id} | {len(path_lengths)} paths | '
            f'min length: {path_lengths.min()} | max length: {path_lengths.max()}'
        )

        dataset['timeouts'] = timeouts
        return dataset

    def process_maze2d_episode(self, episode):
        """
        Overview:
            Process the maze2d episode and add a `next_observations` field to the episode.
        Arguments:
            - episode (:obj:`dict`): The episode dict.
        """
        assert 'next_observations' not in episode
        length = len(episode['observations'])
        next_observations = episode['observations'][1:].copy()
        for key, val in episode.items():
            episode[key] = val[:-1]
        episode['next_observations'] = next_observations
        return episode

    def normalize(self, keys=['observations', 'actions']):
        """
        Overview:
            Normalize the dataset, i.e. the fields that will be predicted by the diffusion model.
        Arguments:
            - keys (:obj:`list`): The list of keys.
        """
        for key in keys:
            array = self.fields[key].reshape(self.n_episodes * self.max_path_length, -1)
            normed = self.normalizer.normalize(array, key)
            self.fields[f'normed_{key}'] = normed.reshape(self.n_episodes, self.max_path_length, -1)

    def make_indices(self, path_lengths, horizon):
        """
        Overview:
            Make indices for sampling from dataset. Each index maps to a datapoint.
        Arguments:
            - path_lengths (:obj:`np.ndarray`): The path length array.
            - horizon (:obj:`int`): The horizon.
        """
        indices = []
        for i, path_length in enumerate(path_lengths):
            max_start = min(path_length - 1, self.max_path_length - horizon)
            if not self.use_padding:
                max_start = min(max_start, path_length - horizon)
            for start in range(max_start):
                end = start + horizon
                indices.append((i, start, end))
        indices = np.array(indices)
        return indices

    def get_conditions(self, observations):
        """
        Overview:
            Get the conditions on current observation for planning.
        Arguments:
            - observations (:obj:`np.ndarray`): The observation array.
        """
        if 'maze2d' in self.env_id:
            return {'condition_id': [0, self.horizon - 1], 'condition_val': [observations[0], observations[-1]]}
        else:
            return {'condition_id': [0], 'condition_val': [observations[0]]}

    def __len__(self):
        """
        Overview:
            Get the length of the dataset.
        """
        return len(self.indices)

    def _get_bounds(self):
        """
        Overview:
            Get the bounds of the dataset.
        """
        print('[ datasets/sequence ] Getting value dataset bounds...', end=' ', flush=True)
        vmin = np.inf
        vmax = -np.inf
        for i in range(len(self.indices)):
            value = self.__getitem__(i)['returns'].item()
            vmin = min(value, vmin)
            vmax = max(value, vmax)
        print('✓')
        return vmin, vmax

    def normalize_value(self, value):
        """
        Overview:
            Normalize the value.
        Arguments:
            - value (:obj:`np.ndarray`): The value array.
        """
        # [0, 1]
        normed = (value - self.vmin) / (self.vmax - self.vmin)
        # [-1, 1]
        normed = normed * 2 - 1
        return normed

    def __getitem__(self, idx, eps=1e-4):
        """
        Overview:
            Get the item of the dataset.
        Arguments:
            - idx (:obj:`int`): The index of the dataset.
            - eps (:obj:`float`): The epsilon.
        """
        path_ind, start, end = self.indices[idx]
        observations = self.fields['normed_observations'][path_ind, start:end]
        actions = self.fields['normed_actions'][path_ind, start:end]
        done = self.fields['terminals'][path_ind, start:end]
        # conditions = self.get_conditions(observations)
        trajectories = np.concatenate([actions, observations], axis=-1)

        if self.include_returns:
            rewards = self.fields['rewards'][path_ind, start:]
            discounts = self.discounts[:len(rewards)]
            returns = (discounts * rewards).sum()
            if self.normed:
                returns = self.normalize_value(returns)
            returns = np.array([returns / self.returns_scale], dtype=np.float32)
            batch = {
                'trajectories': trajectories,
                'returns': returns,
                'done': done,
                'action': actions,
            }
        else:
            batch = {
                'trajectories': trajectories,
                'done': done,
                'action': actions,
            }
        batch.update(self.get_conditions(observations))
        return batch
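

# A minimal config sketch (all field values are placeholder assumptions) listing the fields that
# ``SequenceDataset.__init__`` actually reads. Each item of the resulting dataset is a dict with
# ``trajectories``, ``action``, ``done``, optional ``returns`` and the planning conditions.
def _example_sequence_dataset_cfg():
    return EasyDict(
        dict(
            env=dict(
                env_id='hopper-medium-v2',
                returns_scale=400.0,
                horizon=32,
                max_path_length=1000,
                use_padding=True,
                include_returns=True,
                normed=False,
                termination_penalty=None,
                obs_dim=11,
                action_dim=3,
            ),
            policy=dict(
                collect=dict(data_path=None),
                learn=dict(discount_factor=0.99),
                normalizer='GaussianNormalizer',  # assumed name; must match a DatasetNormalizer option
            ),
        )
    )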


def hdf5_save(exp_data, expert_data_path):
    """
    Overview:
        Save the data to hdf5.
    """
    try:
        import h5py
    except ImportError:
        import sys
        logging.warning("h5py is not found, please install it through `pip install h5py`")
        sys.exit(1)
    dataset = h5py.File('%s_demos.hdf5' % expert_data_path.replace('.pkl', ''), 'w')
    dataset.create_dataset('obs', data=np.array([d['obs'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('action', data=np.array([d['action'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('reward', data=np.array([d['reward'].numpy() for d in exp_data]), compression='gzip')
    dataset.create_dataset('done', data=np.array([d['done'] for d in exp_data]), compression='gzip')
    dataset.create_dataset('next_obs', data=np.array([d['next_obs'].numpy() for d in exp_data]), compression='gzip')


def naive_save(exp_data, expert_data_path):
    """
    Overview:
        Save the data to pickle.
    """
    with open(expert_data_path, 'wb') as f:
        pickle.dump(exp_data, f)


def offline_data_save_type(exp_data, expert_data_path, data_type='naive'):
    """
    Overview:
        Save the offline data with the given ``data_type`` ('naive' or 'hdf5').
    """
    globals()[data_type + '_save'](exp_data, expert_data_path)
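

# A minimal usage sketch (the ``exp_data`` layout and paths are placeholder assumptions): collected
# transitions are dispatched to ``naive_save`` or ``hdf5_save`` according to ``data_type``.
def _example_offline_data_save():
    exp_data = [
        {
            'obs': torch.randn(4),
            'next_obs': torch.randn(4),
            'action': torch.randn(2),
            'reward': torch.tensor(1.0),
            'done': False,
        } for _ in range(8)
    ]
    offline_data_save_type(exp_data, './expert_data.pkl', data_type='naive')
    offline_data_save_type(exp_data, './expert_data.pkl', data_type='hdf5')  # writes ./expert_data_demos.hdf5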


def create_dataset(cfg, **kwargs) -> Dataset:
    """
    Overview:
        Create a dataset instance according to ``cfg.policy.collect.data_type`` via the dataset registry.
    """
    cfg = EasyDict(cfg)
    import_module(cfg.get('import_names', []))
    return DATASET_REGISTRY.build(cfg.policy.collect.data_type, cfg=cfg, **kwargs)
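

# A minimal usage sketch (the registry key 'hdf5' and the path are assumptions; the key must match
# whatever name the desired dataset class was registered under in ``DATASET_REGISTRY``):
def _example_create_dataset():
    cfg = dict(policy=dict(collect=dict(data_type='hdf5', data_path='./expert_demos.hdf5')))
    return create_dataset(cfg)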