Source code for kgcnn.data.datasets.ISO17Dataset

import os
import numpy as np
from ase.db import connect

from kgcnn.data.base import MemoryGraphDataset
from kgcnn.data.download import DownloadDataset


class ISO17Dataset(DownloadDataset, MemoryGraphDataset):
    r"""Dataset 'ISO17' with molecules randomly taken from QM9 dataset [1] with a fixed composition of atoms
    (C7O2H10). They were arranged in different chemically valid structures; the dataset is an extension of the
    isomer MD data used in [2].
    Information below and the dataset itself is copied and downloaded from
    `<http://quantum-machine.org/datasets/>`_ .

    The database consists of 129 molecules each containing 5000 conformational geometries, energies and forces
    with a resolution of 1 femto-second in the molecular dynamics trajectories. The database was generated from
    molecular dynamics simulations using the Fritz-Haber Institute ab initio simulation package (FHI-aims)[3].
    The simulations in ISO17 were carried out using the standard quantum chemistry computational method density
    functional theory (DFT) in the generalized gradient approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE)
    functional[4] and the Tkatchenko-Scheffler (TS) van der Waals correction method [5].

    The dataset is stored in ASE sqlite format with the total energy in eV and forces in eV/Ang.
    Download-url: `<http://quantum-machine.org/datasets/iso17.tar.gz>`_ .

    .. code-block:: python

        from ase.db import connect

        with connect(path_to_db) as conn:
            for row in conn.select(limit=10):
                print(row.toatoms())
                print(row['total_energy'])
                print(row.data['atomic_forces'])

    References:

        (1) R. Ramakrishnan et al. Quantum chemistry structures and properties of 134 kilo molecules.
            Scientific Data, 1. (2014) `<https://www.nature.com/articles/sdata201422>`_ .
        (2) Schütt, K. T. et al. Quantum-chemical insights from deep tensor neural networks.
            Nature Communications, 8, 13890. (2017) `<https://www.nature.com/articles/ncomms13890>`_ .
        (3) Blum, V. et al. Ab Initio Molecular Simulations with Numeric Atom-Centered Orbitals.
            Comput. Phys. Commun. 2009, 180 (11), 2175–2196
            `<https://www.sciencedirect.com/science/article/pii/S0010465509002033>`_ .
        (4) Perdew, J. P. et al. Generalized Gradient Approximation Made Simple.
            Phys. Rev. Lett. 1996, 77 (18), 3865–3868
            `<https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.77.3865>`_.
        (5) Tkatchenko, A. et al. Accurate Molecular Van Der Waals Interactions from Ground-State Electron
            Density and Free-Atom Reference Data. Phys. Rev. Lett. 2009, 102 (7), 73005
            `<https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.102.073005>`_.
        (6) Schütt, K. T., et al. SchNet: A continuous-filter convolutional neural network for modeling
            quantum interactions. Advances in Neural Information Processing System (accepted). 2017.
            `<https://arxiv.org/abs/1706.08566>`_

    """

    # Keyword arguments forwarded to ``DownloadDataset.__init__`` in ``__init__`` below.
    download_info = {
        "dataset_name": "ISO17",
        "data_directory_name": "ISO17",
        "download_url": "http://quantum-machine.org/datasets/iso17.tar.gz",
        "download_file_name": 'iso17.tar.gz',
        "unpack_tar": True,
        "unpack_zip": False,
        "unpack_directory_name": "iso17"
    }

    def __init__(self, reload=False, verbose: int = 10):
        r"""Initialize full :obj:`ISO17Dataset` dataset.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        # Use default base class init()
        self.data_keys = None

        # Both bases are initialised explicitly (no cooperative ``super()`` chain).
        MemoryGraphDataset.__init__(self, dataset_name="ISO17", verbose=verbose)
        DownloadDataset.__init__(self, **self.download_info, reload=reload, verbose=verbose)

        # The five database files that ``read_in_memory`` goes through.
        self.file_name = ["reference.db", "reference_eq.db", "test_within.db", "test_other.db", "test_eq.db"]
        # NOTE(review): the extra "iso17" component presumably mirrors a top-level folder inside the
        # archive -- confirm against the unpacked download.
        self.data_directory = os.path.join(
            self.data_main_dir, self.data_directory_name, self.unpack_directory_name, "iso17")
        self.file_directory = self.data_directory

        if self.fits_in_memory:
            self.read_in_memory()

    def read_in_memory(self):
        r"""Load complete :obj:`ISO17Dataset` data into memory. Additionally, the different train and validation
        properties are set according to
        `http://quantum-machine.org/datasets/ <http://quantum-machine.org/datasets/>`_.

        The data is partitioned as used in the SchNet paper [6]:

        - reference.db: 80% of steps of 80% of MD trajectories
        - reference_eq.db: equilibrium conformations of those molecules
        - test_within.db: remaining 20% unseen steps of reference trajectories
        - test_other.db: remaining 20% unseen MD trajectories
        - test_eq.db: equilibrium conformations of test trajectories

        Where 'reference.db' and 'reference_eq.db' have 'train' property with index 0, 1, respectively, and
        'test_within', 'test_other', 'test_eq' have 'test' property 0, 1, 2, respectively.
        The original validation geometries are noted by 'valid' property with index 0.
        Use :obj:`get_train_test_indices` for reading out split indices.

        Returns:
            self
        """
        data = {
            "formula": [], "numbers": [], "symbols": [], "positions": [], "atomic_forces": [],
            "total_energy": [], "id": [], "train": [], "test": []
        }

        # Add them in the order as given in doc string. 'reference' prefix are training data.
        # Each entry is (file name, 'train' index or None, 'test' index or None).
        partitions = [
            ("reference.db", 0, None),
            ("reference_eq.db", 1, None),
            ("test_within.db", None, 0),
            ("test_other.db", None, 1),
            ("test_eq.db", None, 2),
        ]
        for db_name, train, test in partitions:
            with connect(os.path.join(self.data_directory, db_name)) as conn:
                for row in conn.select():
                    data["numbers"].append(row.numbers)
                    data["id"].append(row.id)
                    data["formula"].append(str(row.toatoms().symbols))
                    data["positions"].append(row.positions)
                    data["symbols"].append(row.symbols)
                    # The energy gets a trailing axis of length one.
                    data["total_energy"].append(np.expand_dims(row['total_energy'], axis=-1))
                    data["atomic_forces"].append(row.data['atomic_forces'])
                    data["train"].append(train)
                    data["test"].append(test)

        for name, values in data.items():
            self.assign_property(name, values)

        # Validation indices are for the first part that is 'reference.db' which are at the beginning of the dataset.
        # Can be simply added by index.
        with open(os.path.join(self.data_directory, "validation_ids.txt")) as f:
            # Blank lines (e.g. a trailing empty line) are skipped; ``int("")`` would raise ValueError.
            valid_indices = [int(line) for line in f if line.strip()]
        for i in valid_indices:
            # Ids from the file are shifted by one to get the position in this dataset.
            self[i - 1].update({"valid": np.array(0)})

        return self