Source code for kgcnn.data.datasets.PROTEINSDataset

import os
import numpy as np

from kgcnn.data.datasets.GraphTUDataset2020 import GraphTUDataset2020
from kgcnn.molecule.encoder import OneHotEncoder


class PROTEINSDataset(GraphTUDataset2020):
    r"""Store and process PROTEINS dataset from `TUDatasets <https://chrsmrrs.github.io/datasets/>`__ .

    In `Papers with Code <https://paperswithcode.com/dataset/proteins>`__ :
    PROTEINS is a dataset of proteins that are classified as enzymes or non-enzymes.
    Nodes represent the amino acids and two nodes are connected by an edge if they are less than
    6 Angstroms apart.

    References:

        (1) K. M. Borgwardt, C. S. Ong, S. Schoenauer, S. V. N. Vishwanathan, A. J. Smola, and H. P. Kriegel.
            Protein function prediction via graph kernels. Bioinformatics, 21(Suppl 1):i47–i56, Jun 2005.
        (2) P. D. Dobson and A. J. Doig. Distinguishing enzyme structures from non-enzymes without alignments.
            J. Mol. Biol., 330(4):771–783, Jul 2003.

    """

    def __init__(self, reload=False, verbose: int = 10):
        """Initialize PROTEINS dataset.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        # The dataset name is fixed to "PROTEINS" for this subclass; everything else is
        # delegated to the base class initializer.
        super().__init__(dataset_name="PROTEINS", reload=reload, verbose=verbose)

    def read_in_memory(self):
        r"""Load PROTEINS dataset into memory and post-process its graph properties.

        The base class ``read_in_memory()`` is called first. Afterwards:

        * ``"graph_labels"`` are shifted by ``-1``.
        * ``"node_attributes"``, ``"node_labels"`` and ``"node_degree"`` are each replaced by one
          array per graph, built by casting every value to ``int`` and passing it through a
          :obj:`OneHotEncoder` with a fixed category list.
        * ``"graph_size"`` is set to the length of each graph's original (un-encoded) node
          attributes, or ``None`` where that entry is ``None``.

        Returns:
            self
        """
        super().read_in_memory()

        def encode_per_graph(encoder, per_graph_values):
            # One array per graph; each value is cast to int before it is handed to the encoder.
            return [np.array([encoder(int(value)) for value in values]) for values in per_graph_values]

        # Fixed category lists for the encoders. Note that 40 is not among the attribute
        # categories; the list is reproduced exactly as given.
        attribute_encoder = OneHotEncoder(
            [-538, -345, -344, -134, -125, -96, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
             20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 47, 61, 63, 73,
             74, 75, 82, 104, 353, 355, 360, 558, 797, 798],
            add_unknown=False)
        label_encoder = OneHotEncoder([0, 1, 2], add_unknown=False)
        degree_encoder = OneHotEncoder(list(range(17)), add_unknown=False)

        # Read every raw property before any of them is overwritten below.
        graph_labels = self.obtain_property("graph_labels")
        node_attributes = self.obtain_property("node_attributes")
        node_labels = self.obtain_property("node_labels")
        node_degree = self.obtain_property("node_degree")

        self.assign_property("graph_labels", [label - 1 for label in graph_labels])
        self.assign_property("node_attributes", encode_per_graph(attribute_encoder, node_attributes))
        self.assign_property("node_labels", encode_per_graph(label_encoder, node_labels))
        self.assign_property("node_degree", encode_per_graph(degree_encoder, node_degree))
        # Graph size is taken from the raw node attributes fetched above, not the encoded arrays.
        self.assign_property(
            "graph_size", [len(attrs) if attrs is not None else None for attrs in node_attributes])

        return self
# ds = PROTEINSDataset()