Source code for kgcnn.data.datasets.MD17Dataset

import os
import numpy as np
from typing import Union
from kgcnn.data.base import MemoryGraphDataset
from kgcnn.data.download import DownloadDataset


class MD17Dataset(DownloadDataset, MemoryGraphDataset):
    r"""Store and process trajectories from the :obj:`MD17Dataset` dataset.

    The dataset contains atomic coordinates of molecular dynamics trajectories, as well as the
    total energy (in kcal/mol) and forces (kcal/mol/Angstrom) on each atom.
    For reference data source, refer to the links
    `<http://www.sgdml.org/#datasets>`_ or `<http://quantum-machine.org/gdml/data/>`_ .

    Which trajectory is downloaded is determined by :obj:`trajectory_name` argument.
    There are two different versions of trajectories, which are a long trajectory on DFT level of
    theory and a short trajectory on coupled cluster level of theory marked in the name by 'dft'
    and 'ccsd_t' respectively.

    Overview:

    .. list-table::
        :widths: 20 10 20 10
        :header-rows: 1

        * - Molecule
          - Level of Theory
          - trajectory_name
          - graphs
        * - Aspirin
          - DFT
          - aspirin_dft
          - 211762
        * - Azobenzene
          - DFT
          - azobenzene_dft
          - 99999
        * - Benzene
          - DFT
          - benzene2017_dft
          - 627983
        * - Benzene
          - DFT
          - benzene2018_dft
          - 49863
        * - Ethanol
          - DFT
          - ethanol_dft
          - 555092
        * - Malonaldehyde
          - DFT
          - malonaldehyde_dft
          - 993237
        * - Naphthalene
          - DFT
          - naphthalene_dft
          - 326250
        * - Paracetamol
          - DFT
          - paracetamol_dft
          - 106490
        * - Salicylic
          - DFT
          - salicylic_dft
          - 320231
        * - Toluene
          - DFT
          - toluene_dft
          - 442790
        * - Uracil
          - DFT
          - uracil_dft
          - 133770
        * - Aspirin
          - CCSD
          - aspirin_ccsd
          - 1500
        * - Benzene
          - CCSD
          - benzene_ccsd_t
          - 1500
        * - Ethanol
          - CCSD
          - ethanol_ccsd_t
          - 2000
        * - Malonaldehyde
          - CCSD
          - malonaldehyde_ccsd_t
          - 1500
        * - Toluene
          - CCSD
          - toluene_ccsd_t
          - 1501

    It is recommended to use the given train-test splits. Only the requested trajectory is
    downloaded.
    """

    # Per-trajectory download settings, keyed by ``trajectory_name``. Entries are treated as
    # read-only templates; ``__init__`` copies the selected entry before extending it.
    # NOTE(review): "CG-CG" is accepted here but not listed in the overview table above.
    datasets_download_info = {
        "CG-CG": {"download_file_name": "CG-CG.npz"},
        "aspirin_dft": {"download_file_name": "aspirin_dft.npz"},
        "azobenzene_dft": {"download_file_name": "azobenzene_dft.npz"},
        "benzene2017_dft": {"download_file_name": "benzene2017_dft.npz"},
        "benzene2018_dft": {"download_file_name": "benzene2018_dft.npz"},
        "ethanol_dft": {"download_file_name": "ethanol_dft.npz"},
        "malonaldehyde_dft": {"download_file_name": "malonaldehyde_dft.npz"},
        "naphthalene_dft": {"download_file_name": "naphthalene_dft.npz"},
        "paracetamol_dft": {"download_file_name": "paracetamol_dft.npz"},
        "salicylic_dft": {"download_file_name": "salicylic_dft.npz"},
        "toluene_dft": {"download_file_name": "toluene_dft.npz"},
        "uracil_dft": {"download_file_name": "uracil_dft.npz"},
        "aspirin_ccsd": {"download_file_name": "aspirin_ccsd.zip", "unpack_zip": True,
                         "unpack_directory_name": "aspirin_ccsd"},
        "benzene_ccsd_t": {"download_file_name": "benzene_ccsd_t.zip", "unpack_zip": True,
                           "unpack_directory_name": "benzene_ccsd_t"},
        "ethanol_ccsd_t": {"download_file_name": "ethanol_ccsd_t.zip", "unpack_zip": True,
                           "unpack_directory_name": "ethanol_ccsd_t"},
        "malonaldehyde_ccsd_t": {"download_file_name": "malonaldehyde_ccsd_t.zip", "unpack_zip": True,
                                 "unpack_directory_name": "malonaldehyde_ccsd_t"},
        "toluene_ccsd_t": {"download_file_name": "toluene_ccsd_t.zip", "unpack_zip": True,
                           "unpack_directory_name": "toluene_ccsd_t"},
    }

    def __init__(self, trajectory_name: str = None, reload=False, verbose=10):
        """Initialize MD17Dataset dataset.

        Args:
            trajectory_name (str): Name of a trajectory or molecule. Must be a key of
                :obj:`datasets_download_info`.
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.

        Raises:
            ValueError: If `trajectory_name` is not a key of :obj:`datasets_download_info`.
        """
        self.data_keys = None
        self.trajectory_name = trajectory_name

        MemoryGraphDataset.__init__(self, dataset_name="MD17", verbose=verbose)

        # Prepare download.
        if trajectory_name not in self.datasets_download_info:
            raise ValueError(
                "Can not resolve '%s' trajectory. Choose: %s." % (
                    trajectory_name, list(self.datasets_download_info.keys())))
        # Copy the template so adding the URL does not mutate the class-level dictionary that is
        # shared between all instances.
        self.download_info = dict(self.datasets_download_info[trajectory_name])
        self.download_info.update({
            "download_url": "http://quantum-machine.org/gdml/data/npz/%s" % self.download_info[
                "download_file_name"]})

        DownloadDataset.__init__(
            self, dataset_name="MD17", data_directory_name="MD17", **self.download_info,
            reload=reload, verbose=verbose)

        # NOTE(review): the attributes read below (`unpack_zip`, `download_file_name`,
        # `unpack_directory_name`, `data_main_dir`, `data_directory_name`, `fits_in_memory`) are
        # not set in this class; presumably the parent constructors provide them.
        if self.unpack_zip:
            # Zipped trajectories are expected to contain a separate train and test file.
            stem = os.path.splitext(self.download_file_name)[0]
            self.file_name = [stem + "-train.npz", stem + "-test.npz"]
        else:
            self.file_name = str(self.download_file_name)

        if self.unpack_directory_name is None:
            self.data_directory = os.path.join(self.data_main_dir, self.data_directory_name)
        else:
            self.data_directory = os.path.join(
                self.data_main_dir, self.data_directory_name, self.unpack_directory_name)

        self.dataset_name = self.dataset_name + "_" + self.trajectory_name

        if self.fits_in_memory:
            self.read_in_memory()

    def _get_trajectory_from_npz(self, file_path: Union[str, list, tuple] = None):
        """Load the trajectory file(s) with :obj:`numpy.load`.

        Args:
            file_path (str, list, tuple): Path or sequence of paths to load. If `None`, the path is
                built from `data_directory` and `file_name` of this dataset. Default is None.

        Returns:
            The object returned by :obj:`numpy.load` for a single path, or a list of such objects
            (one per path) for a list or tuple of paths. The caller is responsible for closing them.

        Raises:
            TypeError: If no `file_path` is given and `file_name` is neither a string nor a
                list or tuple.
        """
        if file_path is None:
            # Determine filepath from dataset information.
            if isinstance(self.file_name, str):
                file_path = os.path.join(self.data_directory, self.file_name)
            elif isinstance(self.file_name, (list, tuple)):
                file_path = [os.path.join(self.data_directory, x) for x in self.file_name]
            else:
                # The exception was previously constructed but never raised, so execution fell
                # through and failed later with an unrelated message.
                raise TypeError("Unknown type for file name '%s'." % self.file_name)

        if isinstance(file_path, (list, tuple)):
            return [np.load(x) for x in file_path]
        return np.load(file_path)

    @staticmethod
    def _make_property_dict(data, is_split: dict = None):
        """Convert one loaded trajectory into a dictionary of per-frame property lists.

        Args:
            data: Mapping-like object with the entries "R", "E", "F", "z", "name", "type", "md5"
                and "theory", e.g. as returned by :obj:`numpy.load` for one trajectory file.
            is_split (dict): Optional extra properties. Each value is repeated for every frame
                under its key. Default is None.

        Returns:
            dict: Property name mapped to a list with one entry per frame.
        """
        # note: Could check if all keys are available here.
        out_dict = {}
        # Entries that are stored per frame are split along their first axis.
        for key in ["R", "E", "F"]:
            out_dict[key] = [np.array(x) for x in data[key]]
        num_data_points = len(out_dict["R"])
        # Entries stored once per file are copied for each frame.
        for key in ["z", 'name', 'type', 'md5', "theory"]:
            value = data[key]
            out_dict[key] = [np.array(value) for _ in range(num_data_points)]
        if is_split is not None:
            for key, value in is_split.items():
                # The same object is referenced by all frames, it is not copied.
                out_dict[key] = [value for _ in range(num_data_points)]
        return out_dict

    def read_in_memory(self):
        """Load a trajectory into memory.

        Every property of the trajectory is assigned to the dataset via `assign_property`. If the
        trajectory consists of a train and a test file, both are concatenated in that order and
        the additional properties "train" and "test" mark the origin of each frame with
        ``np.array([1])``, or `None` if the frame is not part of that split.

        Returns:
            self
        """
        data_loaded = self._get_trajectory_from_npz()
        is_multi_file = isinstance(data_loaded, (list, tuple))
        try:
            if is_multi_file:
                split_assignment = [{"train": np.array([1]), "test": None},
                                    {"train": None, "test": np.array([1])}]
                prop_dicts = [self._make_property_dict(x, is_split=split_assignment[i])
                              for i, x in enumerate(data_loaded)]
                for key_prop in prop_dicts[0].keys():
                    # Concatenate the frames of all parts in file order.
                    self.assign_property(
                        key_prop, [item for part in prop_dicts for item in part[key_prop]])
            else:
                for key_prop, value_prop in self._make_property_dict(data_loaded).items():
                    self.assign_property(key_prop, value_prop)
        finally:
            # Archives opened by numpy keep a file handle; all arrays have been copied at this
            # point, so the handles can be released.
            for handle in (data_loaded if is_multi_file else [data_loaded]):
                close = getattr(handle, "close", None)
                if callable(close):
                    close()
        return self