Making-Superhero-Protein

Running

Making-Superhero-Protein / model /se3_transformer /data_loading /qm9.py

Jacob Gershon

new b

59a9ccf over 2 years ago

7.84 kB

	# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# Permission is hereby granted, free of charge, to any person obtaining a
	# copy of this software and associated documentation files (the "Software"),
	# to deal in the Software without restriction, including without limitation
	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	# and/or sell copies of the Software, and to permit persons to whom the
	# Software is furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	# DEALINGS IN THE SOFTWARE.
	#
	# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES
	# SPDX-License-Identifier: MIT
	from typing import Tuple

	import dgl
	import pathlib
	import torch
	from dgl.data import QM9EdgeDataset
	from dgl import DGLGraph
	from torch import Tensor
	from torch.utils.data import random_split, DataLoader, Dataset
	from tqdm import tqdm

	from se3_transformer.data_loading.data_module import DataModule
	from se3_transformer.model.basis import get_basis
	from se3_transformer.runtime.utils import get_local_rank, str2bool, using_tensor_cores


	def _get_relative_pos(qm9_graph: DGLGraph) -> Tensor:
	    """
	    Compute, for every edge of the graph, the destination node position minus the source node position.

	    :param qm9_graph: Graph whose node data stores coordinates under the 'pos' key
	    :return: Tensor with one entry per edge, equal to pos[dst] - pos[src]
	    """
	    positions = qm9_graph.ndata['pos']
	    source_ids, target_ids = qm9_graph.edges()
	    return positions[target_ids] - positions[source_ids]


	def _get_split_sizes(full_dataset: Dataset) -> Tuple[int, int, int]:
	len_full = len(full_dataset)
	len_train = 100_000
	len_test = int(0.1 * len_full)
	len_val = len_full - len_train - len_test
	return len_train, len_val, len_test


	class QM9DataModule(DataModule):
	    """
	    Datamodule wrapping https://docs.dgl.ai/en/latest/api/python/dgl.data.html#qm9edge-dataset
	    Training set is 100k molecules. Test set is 10% of the dataset. Validation set is the rest.
	    This includes all the molecules from QM9 except the ones that are uncharacterized.
	    """

	    NODE_FEATURE_DIM = 6
	    EDGE_FEATURE_DIM = 4

	    def __init__(self,
	                 data_dir: pathlib.Path,
	                 task: str = 'homo',
	                 batch_size: int = 240,
	                 num_workers: int = 8,
	                 num_degrees: int = 4,
	                 amp: bool = False,
	                 precompute_bases: bool = False,
	                 **kwargs):
	        """
	        :param data_dir: Directory handed to the QM9 dataset as its raw data directory
	        :param task: Name of the regression target; used as the single label key of the dataset
	        :param batch_size: Batch size forwarded to the parent DataModule (and used to chunk precomputed bases)
	        :param num_workers: Number of workers forwarded to the parent DataModule
	        :param num_degrees: Number of degrees; bases are precomputed with max_degree = num_degrees - 1
	        :param amp: AMP flag, forwarded to the bases computation and to using_tensor_cores
	        :param precompute_bases: If True, build a CachedBasesQM9EdgeDataset instead of a plain QM9EdgeDataset
	        :param kwargs: Accepted and ignored
	        """
	        self.data_dir = data_dir  # This needs to be before __init__ so that prepare_data has access to it
	        super().__init__(batch_size=batch_size, num_workers=num_workers, collate_fn=self._collate)
	        self.amp = amp
	        self.task = task
	        self.batch_size = batch_size
	        self.num_degrees = num_degrees

	        qm9_kwargs = dict(label_keys=[self.task], verbose=False, raw_dir=str(data_dir))
	        if precompute_bases:
	            bases_kwargs = dict(max_degree=num_degrees - 1, use_pad_trick=using_tensor_cores(amp), amp=amp)
	            full_dataset = CachedBasesQM9EdgeDataset(bases_kwargs=bases_kwargs, batch_size=batch_size, **qm9_kwargs)
	        else:
	            full_dataset = QM9EdgeDataset(**qm9_kwargs)

	        # Fixed seed so that the train / val / test partition is the same on every run
	        self.ds_train, self.ds_val, self.ds_test = random_split(full_dataset, _get_split_sizes(full_dataset),
	                                                                generator=torch.Generator().manual_seed(0))

	        # Normalization statistics are computed on the training targets only
	        train_targets = full_dataset.targets[self.ds_train.indices, full_dataset.label_keys[0]]
	        self.targets_mean = train_targets.mean()
	        self.targets_std = train_targets.std()

	    def prepare_data(self):
	        # Download the QM9 preprocessed data
	        QM9EdgeDataset(verbose=True, raw_dir=str(self.data_dir))

	    def _collate(self, samples):
	        """
	        Merge a list of dataset samples into one batch.

	        :param samples: List of (graph, label) or (graph, label, bases) tuples
	        :return: (batched_graph, node_feats, edge_feats, targets), with the concatenated bases
	                 inserted before targets when the samples carry precomputed bases
	        """
	        # Transpose the list of samples into per-field lists. The starred target collects the
	        # optional third field: it is [] for 2-tuples and [list_of_bases_dicts] for 3-tuples.
	        graphs, y, *bases = map(list, zip(*samples))
	        batched_graph = dgl.batch(graphs)
	        edge_feats = {'0': batched_graph.edata['edge_attr'][..., None]}
	        batched_graph.edata['rel_pos'] = _get_relative_pos(batched_graph)
	        # get node features
	        node_feats = {'0': batched_graph.ndata['attr'][:, :6, None]}
	        # Standardize the targets with the training set statistics
	        targets = (torch.cat(y) - self.targets_mean) / self.targets_std

	        if bases:
	            # collate bases
	            all_bases = {
	                key: torch.cat([b[key] for b in bases[0]], dim=0)
	                for key in bases[0][0].keys()
	            }

	            return batched_graph, node_feats, edge_feats, all_bases, targets
	        else:
	            return batched_graph, node_feats, edge_feats, targets

	    @staticmethod
	    def add_argparse_args(parent_parser):
	        """
	        Register the QM9-specific command line options in their own argument group.

	        :param parent_parser: Parser to extend
	        :return: The same parent_parser
	        """
	        parser = parent_parser.add_argument_group("QM9 dataset")
	        parser.add_argument('--task', type=str, default='homo', const='homo', nargs='?',
	                            choices=['mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv',
	                                     'U0_atom', 'U_atom', 'H_atom', 'G_atom', 'A', 'B', 'C'],
	                            help='Regression task to train on')
	        parser.add_argument('--precompute_bases', type=str2bool, nargs='?', const=True, default=False,
	                            help='Precompute bases at the beginning of the script during dataset initialization,'
	                                 ' instead of computing them at the beginning of each forward pass.')
	        return parent_parser

	    def __repr__(self):
	        return f'QM9({self.task})'


	class CachedBasesQM9EdgeDataset(QM9EdgeDataset):
	    """ Dataset extending the QM9 dataset from DGL with precomputed (cached in RAM) pairwise bases """

	    def __init__(self, bases_kwargs: dict, batch_size: int, *args, **kwargs):
	        """
	        :param bases_kwargs: Arguments to feed the bases computation function
	        :param batch_size: Batch size to use when iterating over the dataset for computing bases
	        :param args: Positional arguments forwarded unchanged to QM9EdgeDataset
	        :param kwargs: Keyword arguments forwarded unchanged to QM9EdgeDataset
	        """
	        # These attributes are assigned before the parent constructor runs; presumably the
	        # parent constructor triggers load(), which reads them -- confirm against DGL.
	        self.bases_kwargs = bases_kwargs
	        self.batch_size = batch_size
	        self.bases = None
	        super().__init__(*args, **kwargs)

	    def load(self):
	        """ Load the dataset through the parent class, then precompute the bases of every batch of graphs """
	        super().load()
	        # Iterate through the dataset and compute bases (pairwise only)
	        # Potential improvement: use multi-GPU and reduction
	        dataloader = DataLoader(self, shuffle=False, batch_size=self.batch_size,
	                                collate_fn=lambda samples: dgl.batch([sample[0] for sample in samples]))
	        bases = []
	        # The progress bar is only displayed on the process with local rank 0
	        for graph in tqdm(dataloader, total=len(dataloader), desc='Precomputing QM9 bases',
	                          disable=get_local_rank() != 0):
	            rel_pos = _get_relative_pos(graph)
	            # Compute the bases with the GPU but convert the result to CPU to store in RAM
	            bases.append({k: v.cpu() for k, v in get_basis(rel_pos.cuda(), **self.bases_kwargs).items()})
	        self.bases = bases  # Assign at the end so that __getitem__ isn't confused

	    def __getitem__(self, idx: int):
	        """
	        :param idx: Index of the sample in the dataset
	        :return: (graph, label, bases) once the bases are computed, (graph, label) otherwise
	        """
	        graph, label = super().__getitem__(idx)

	        if self.bases:
	            # Bases are stored per batch of self.batch_size consecutive samples: locate the batch,
	            # then slice out the rows of this sample using offsets relative to the batch start.
	            # NOTE(review): ne_cumsum is presumably the cumulative edge count per graph provided
	            # by the parent dataset -- confirm against DGL.
	            bases_idx = idx // self.batch_size
	            batch_offset = self.ne_cumsum[bases_idx * self.batch_size]
	            bases_cumsum_idx = self.ne_cumsum[idx] - batch_offset
	            bases_cumsum_next_idx = self.ne_cumsum[idx + 1] - batch_offset
	            return graph, label, {key: basis[bases_cumsum_idx:bases_cumsum_next_idx] for key, basis in
	                                  self.bases[bases_idx].items()}
	        else:
	            return graph, label