Source code for kgcnn.data.datasets.MutagenicityDataset

import os
import numpy as np

from kgcnn.data.datasets.GraphTUDataset2020 import GraphTUDataset2020


class MutagenicityDataset(GraphTUDataset2020):
    r"""Store and process Mutagenicity dataset from `TUDatasets <https://chrsmrrs.github.io/datasets/>`__ .

    Mutagenicity is a chemical compound dataset of drugs, which can be categorized into two classes:
    mutagen and non-mutagen.

    On top of the generic TU-dataset loading of the base class, :obj:`read_in_memory` translates the
    integer node labels into atomic numbers and element symbols and removes atoms that take part in
    no bond (except for Na, K, Li and Ca, which are kept even when unconnected).

    References:

        (1) Riesen, K. and Bunke, H.: IAM Graph Database Repository for Graph Based Pattern Recognition and
            Machine Learning. In: da Vitoria Lobo, N. et al. (Eds.), SSPR&SPR 2008, LNCS, vol. 5342, pp. 287-297, 2008.

    """

    def __init__(self, reload=False, verbose: int = 10):
        """Initialize Mutagenicity dataset.

        Args:
            reload (bool): Whether to reload the data and make new dataset. Default is False.
            verbose (int): Print progress or info for processing where 60=silent. Default is 10.
        """
        # Everything is delegated to the base class; only the dataset name is fixed here.
        super().__init__("Mutagenicity", reload=reload, verbose=verbose)

    def read_in_memory(self):
        r"""Load Mutagenicity Dataset into memory and already split into items with further cleaning and
        processing.

        After the base class has loaded the raw properties, this method

        1. maps each integer node label to an atomic number and an element symbol,
        2. shifts the edge labels by one,
        3. drops atoms that appear in no edge, unless they are Na, K, Li or Ca, and re-indexes the
           edge indices of the affected molecules accordingly,
        4. re-assigns the cleaned lists as dataset properties.

        Returns:
            self
        """
        super().read_in_memory()

        # Position in these tables == node label used in the raw dataset.
        node_translate = np.array([6, 8, 17, 1, 7, 9, 35, 16, 15, 53, 11, 19, 3, 20], dtype="int")
        # Atomic number 19 is potassium, i.e. 'K'.
        atoms_translate = ['C', 'O', 'Cl', 'H', 'N', 'F', 'Br', 'S', 'P', 'I', 'Na', 'K', 'Li', 'Ca']
        z_translate = dict(zip(node_translate.tolist(), atoms_translate))
        # Atomic numbers (Ca, Li, K, Na) that are kept even without any bond.
        allowed_unconnected = [20, 3, 19, 11]

        edge_indices = self.obtain_property("edge_indices")
        node_labels = self.obtain_property("node_labels")
        edge_labels = self.obtain_property("edge_labels")
        graph_labels = self.obtain_property("graph_labels")

        # Node labels are read as column vectors; take the first column after translation.
        nodes = [node_translate[np.array(x, dtype="int")][:, 0] for x in node_labels]
        atoms = [[atoms_translate[int(y[0])] for y in x] for x in node_labels]
        # NOTE(review): presumably the raw edge labels are zero-based bond types and the shift
        # makes them start at one -- confirm against the dataset description.
        edges = [x[:, 0]+1 for x in edge_labels]
        labels = graph_labels

        # Containers for the cleaned dataset.
        labels_clean = []
        nodes_clean = []
        edge_indices_clean = []
        edges_clean = []
        atoms_clean = []

        # Remove unconnected atoms. not Na Li etc.
        self.info("Checking database...")
        for i, nats in enumerate(nodes):
            # Cast to int so that an empty (possibly float-typed) edge array can still be used as index.
            edge_idx = np.asarray(edge_indices[i], dtype="int")
            cons = np.arange(len(nats))

            # An atom counts as connected if it occurs in any edge or is an allowed counter-ion.
            is_cons = np.zeros_like(cons, dtype="bool")
            is_cons[np.unique(edge_idx)] = True
            is_cons[np.isin(nats, allowed_unconnected)] = True

            if not np.all(is_cons):
                removed, removed_cnt = np.unique(nats[~is_cons], return_counts=True)
                info_list = {z_translate[int(z)]: int(c) for z, c in zip(removed, removed_cnt)}
                self.info("Removing unconnected %s from molecule %s" % (info_list, i))
                nodes_clean.append(nats[is_cons])
                atoms_clean.append([symbol for symbol, keep in zip(atoms[i], is_cons) if keep])
                # Need to correct edge_indices: build a lookup old index -> new index for kept atoms.
                # Removed atoms have no edges, so their (zero) entries in the lookup are never read.
                indices_used = cons[is_cons]
                indices_old = np.zeros(len(nats), dtype="int")
                indices_old[indices_used] = np.arange(len(indices_used))
                edge_indices_clean.append(indices_old[edge_idx])
            else:
                nodes_clean.append(nats)
                atoms_clean.append(atoms[i])
                edge_indices_clean.append(edge_indices[i])
            # Edges and graph labels are never dropped, only atoms without edges are.
            edges_clean.append(edges[i])
            labels_clean.append(labels[i])

        self.info("Database still has unconnected Na+, Li+, K+ etc.")

        # Since no attributes in graph dataset, we use labels as attributes
        self.assign_property("graph_labels", labels_clean)
        self.assign_property("edge_indices", edge_indices_clean)
        self.assign_property("node_attributes", nodes_clean)
        self.assign_property("edge_attributes", edges_clean)
        self.assign_property("node_labels", nodes_clean)
        self.assign_property("edge_labels", edges_clean)
        self.assign_property("node_symbol", atoms_clean)
        self.assign_property("node_number", nodes_clean)
        self.assign_property("graph_size", [len(x) for x in nodes_clean])

        return self