Source code for kgcnn.data.datasets.MutagenicityDataset

import os
import numpy as np

from kgcnn.data.datasets.GraphTUDataset2020 import GraphTUDataset2020


class MutagenicityDataset(GraphTUDataset2020):
    r"""Store and process Mutagenicity dataset from `TUDatasets <https://chrsmrrs.github.io/datasets/>`__ .

    Mutagenicity is a chemical compound dataset of drugs, which can be categorized into two classes:
    mutagen and non-mutagen.

    References:

        (1) Riesen, K. and Bunke, H.: IAM Graph Database Repository for Graph Based Pattern Recognition
            and Machine Learning. In: da Vitora Lobo, N. et al. (Eds.), SSPR&SPR 2008, LNCS, vol. 5342,
            pp. 287-297, 2008.
    """

    def __init__(self, reload=False, verbose: int = 10):
        """Initialize Mutagenicity dataset.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        # Use default base class init() with the fixed dataset name.
        super().__init__("Mutagenicity", reload=reload, verbose=verbose)

    def read_in_memory(self):
        r"""Load Mutagenicity Dataset into memory and already split into items with further cleaning
        and processing.

        After the base class has loaded the raw properties, this method

        * translates the integer node labels into atomic numbers and element symbols,
        * shifts the edge labels by one,
        * removes atoms that do not appear in any edge (except Na, K, Li and Ca, which are kept even
          when unconnected) and renumbers the edge indices accordingly,
        * stores the cleaned lists back as dataset properties.

        Returns:
            self
        """
        super().read_in_memory()

        # Position k in both tables belongs to raw node label k.
        node_translate = np.array([6, 8, 17, 1, 7, 9, 35, 16, 15, 53, 11, 19, 3, 20], dtype="int")
        # Atomic number 19 is potassium, hence the symbol 'K'.
        atoms_translate = ['C', 'O', 'Cl', 'H', 'N', 'F', 'Br', 'S', 'P', 'I', 'Na', 'K', 'Li', 'Ca']
        # Atomic number -> element symbol, only used for log messages.
        z_translate = dict(zip(node_translate.tolist(), atoms_translate))
        # Atomic numbers (Ca, Li, K, Na) that may stay in a molecule without any bond.
        allowed_unconnected = [20, 3, 19, 11]

        edge_indices = self.obtain_property("edge_indices")
        node_labels = self.obtain_property("node_labels")
        edge_labels = self.obtain_property("edge_labels")
        graph_labels = self.obtain_property("graph_labels")

        # Node labels are indexed as 2D (one label column per node); column 0 holds the label.
        nodes = [node_translate[np.array(x, dtype="int")][:, 0] for x in node_labels]
        atoms = [[atoms_translate[int(y[0])] for y in x] for x in node_labels]
        # NOTE(review): the +1 presumably turns 0-based edge labels into bond orders - confirm.
        edges = [x[:, 0] + 1 for x in edge_labels]
        labels = graph_labels

        # Require cleaning steps
        labels_clean = []
        nodes_clean = []
        edge_indices_clean = []
        edges_clean = []
        atoms_clean = []

        # Remove unconnected atoms. not Na Li etc.
        self.info("Checking database...")
        for i, nats in enumerate(nodes):
            # Cast to int so that graphs without edges (possibly empty float arrays) can still
            # be used for indexing below.
            edge_idx = np.asarray(edge_indices[i], dtype="int")

            # Mask of atoms to keep: every atom that occurs in an edge ...
            is_cons = np.zeros(len(nats), dtype="bool")
            is_cons[np.unique(edge_idx)] = True
            # ... plus the elements that are allowed to be unconnected.
            is_cons[np.isin(nats, allowed_unconnected)] = True

            if not is_cons.all():
                removed_z, removed_cnt = np.unique(nats[~is_cons], return_counts=True)
                # Plain ints keep the log message readable independent of the numpy version.
                info_list = {z_translate[int(z)]: int(c) for z, c in zip(removed_z, removed_cnt)}
                self.info("Removing unconnected %s from molecule %s" % (info_list, i))

                nodes_clean.append(nats[is_cons])
                atoms_clean.append([symbol for symbol, keep in zip(atoms[i], is_cons) if keep])

                # Need to correct edge_indices: map each old atom index to its position among the
                # kept atoms. Removed atoms never occur in an edge, so their entry (0) is unused.
                old_to_new = np.zeros(len(nats), dtype="int")
                old_to_new[is_cons] = np.arange(np.count_nonzero(is_cons))
                edge_indices_clean.append(old_to_new[edge_idx])
            else:
                nodes_clean.append(nats)
                atoms_clean.append(atoms[i])
                edge_indices_clean.append(edge_indices[i])

            # Only atoms without edges are removed, so edge and graph labels stay unchanged.
            edges_clean.append(edges[i])
            labels_clean.append(labels[i])

        self.info("Database still has unconnected Na+, Li+, K+ etc.")

        # Since no attributes in graph dataset, we use labels as attributes
        self.assign_property("graph_labels", labels_clean)
        self.assign_property("edge_indices", edge_indices_clean)
        self.assign_property("node_attributes", nodes_clean)
        self.assign_property("edge_attributes", edges_clean)
        self.assign_property("node_labels", nodes_clean)
        self.assign_property("edge_labels", edges_clean)
        self.assign_property("node_symbol", atoms_clean)
        self.assign_property("node_number", nodes_clean)
        self.assign_property("graph_size", [len(x) for x in nodes_clean])

        return self