# Source code for kgcnn.data.tudataset

import numpy as np
import os
from typing import Callable
# import logging

from kgcnn.data.base import MemoryGraphDataset

# TUDataset: A collection of benchmark datasets for learning with graphs
# by Christopher Morris and Nils M. Kriege and Franka Bause and Kristian Kersting and Petra Mutzel and Marion Neumann
# http://graphlearning.io


class GraphTUDataset(MemoryGraphDataset):
    r"""Base class for loading graph datasets published by `TU Dortmund University
    <https://chrsmrrs.github.io/datasets>`_.

    Datasets contain non-isomorphic graphs for many graph classification or regression tasks.
    This general base class has functionality to load TUDatasets in a generic way.
    The datasets are already in a graph-like format and do not need further processing via e.g. `prepare data`.

    .. note::

        Note that subclasses of `GraphTUDataset2020` in :obj:`kgcnn.data.datasets` download the datasets.
        There are also further TU-datasets in :obj:`kgcnn.data.datasets`, if further processing is used in literature.
        Not all datasets can provide all types of graph properties like `edge_attributes` etc.

    The file structure of :obj:`GraphTUDataset` for a given dataset 'DS' (replace DS with dataset name).

    .. code-block:: console

        ├── data_directory
            ├── DS_graph_indicator.txt
            ├── DS_A.txt
            ├── DS_node_labels.txt
            ├── DS_node_attributes.txt
            ├── DS_edge_labels.txt
            ├── DS_edge_attributes.txt
            ├── DS_graph_labels.txt
            ├── DS_graph_attributes.txt
            ├──  ...
            └── dataset_name.kgcnn.pickle

    Only `DS_A.txt` and `DS_graph_indicator.txt` are mandatory for :obj:`read_in_memory`; every other file is
    optional and the matching graph property is set to `None` if the file is missing.
    Setting up a single file can be made additionally with base class :obj:`save` method.
    """

    def __init__(self, data_directory: str = None, dataset_name: str = None, file_name: str = None,
                 file_directory: str = None, verbose: int = 10):
        r"""Initialize a :obj:`GraphTUDataset` instance from file.

        Args:
            data_directory (str): Full path to directory of the dataset. Default is None.
            dataset_name (str): Name of the dataset. Important for base-name for naming of files. Default is None.
            file_name (str): Filename for reading into memory. Not used for general TUDataset, since there are multiple
                files with a prefix and pre-defined suffix. Default is None.
            file_directory (str): Name or relative path from :obj:`data_directory` to a directory containing sorted
                files. Default is None.
            verbose (int): Logging level. Default is 10.
        """
        MemoryGraphDataset.__init__(self, data_directory=data_directory, dataset_name=dataset_name,
                                    file_name=file_name, verbose=verbose, file_directory=file_directory)

    def _read_optional_array(self, filepath: str):
        """Read an optional float-valued dataset file into an array, or return None if the file is missing.

        Args:
            filepath (str): Full path of the file to read with :obj:`read_csv_simple`.

        Returns:
            np.ndarray: Array with one row per line of the file, or None if the file does not exist.
        """
        try:
            return np.array(self.read_csv_simple(filepath, dtype=float))
        except FileNotFoundError:
            return None

    def read_in_memory(self):
        r"""Read the :obj:`GraphTUDataset` into memory.

        The TUDataset is stored in disjoint representations. The data is cast
        to a list of separate graph properties for `MemoryGraphDataset`.
        The properties `node_degree`, `node_attributes`, `node_labels`, `edge_attributes`, `edge_indices`,
        `edge_labels`, `graph_attributes` and `graph_labels` are set via `assign_property`. Properties whose file
        is missing are assigned `None`.

        If `dataset_name` or `data_directory` is not set, an error is logged and nothing is read.

        Returns:
            self
        """
        if self.dataset_name is None or self.data_directory is None:
            self.error("Dataset needs name {0} and path {1}.".format(self.dataset_name, self.data_directory))
            return self

        name_dataset = self.dataset_name
        path = os.path.realpath(self.data_directory)
        if self.file_directory is not None:
            path = os.path.join(path, self.file_directory)

        self.info("Reading dataset to memory with name %s" % str(self.dataset_name))

        def dataset_file(suffix):
            # All files of a dataset share the dataset name as prefix.
            return os.path.join(path, name_dataset + suffix)

        # Edge list and node-to-graph assignment define the graphs. Both files must exist.
        g_a = np.array(self.read_csv_simple(dataset_file("_A.txt"), dtype=int), dtype="int")
        g_n_id = np.array(self.read_csv_simple(dataset_file("_graph_indicator.txt"), dtype=int), dtype="int")

        # Optional labels.
        g_labels = self._read_optional_array(dataset_file("_graph_labels.txt"))
        n_labels = self._read_optional_array(dataset_file("_node_labels.txt"))
        e_labels = self._read_optional_array(dataset_file("_edge_labels.txt"))

        # Optional attributes.
        n_attr = self._read_optional_array(dataset_file("_node_attributes.txt"))
        e_attr = self._read_optional_array(dataset_file("_edge_attributes.txt"))
        g_attr = self._read_optional_array(dataset_file("_graph_attributes.txt"))

        # Number of graphs: the largest graph id equals the count for ids that start at 1. For ids that already
        # start at 0 the count is one larger, otherwise the per-graph arrays below would be one entry too short.
        first_graph_id = int(np.amin(g_n_id))
        num_graphs = int(np.amax(g_n_id))
        if first_graph_id == 0:
            num_graphs += 1

        if g_labels is not None and len(g_labels) != num_graphs:
            self.error(
                "Wrong number of graphs, not matching graph labels, {0}, {1}.".format(len(g_labels), num_graphs))

        # Shift index, should start at 0 for python indexing.
        if first_graph_id == 1 and int(np.amin(g_a)) == 1:
            self.info("Shift start of graph ID to zero for '%s' to match python indexing." % name_dataset)
            g_a = g_a - 1
            g_n_id = g_n_id - 1

        # Number of nodes per graph. Graph ids without any node keep a count of zero.
        graph_id, counts = np.unique(g_n_id, return_counts=True)
        graph_len = np.zeros(num_graphs, dtype="int")
        graph_len[graph_id] = counts

        # Split the stacked node properties at the boundaries between consecutive graphs.
        node_splits = np.cumsum(graph_len)[:-1]
        if n_attr is not None:
            n_attr = np.split(n_attr, node_splits)
        if n_labels is not None:
            n_labels = np.split(n_labels, node_splits)

        # Number of edges per graph. The graph of an edge is looked up via the node in its first column
        # (the second column is expected to belong to the same graph).
        graph_id_edge = g_n_id[g_a[:, 0]]
        graph_id2, counts_edge = np.unique(graph_id_edge, return_counts=True)
        edge_len = np.zeros(num_graphs, dtype="int")
        edge_len[graph_id2] = counts_edge

        edge_splits = np.cumsum(edge_len)[:-1]
        if e_attr is not None:
            e_attr = np.split(e_attr, edge_splits)
        if e_labels is not None:
            e_labels = np.split(e_labels, edge_splits)

        # Translate global node indices into indices that are local to each graph.
        node_index = np.concatenate([np.arange(x) for x in graph_len], axis=0)
        edge_indices = node_index[g_a]
        edge_indices = np.concatenate([edge_indices[:, 1:], edge_indices[:, :1]], axis=-1)  # switch indices
        edge_indices = np.split(edge_indices, edge_splits)

        # Count, for each graph, the nodes that do not appear in any edge.
        all_cons = []
        for num_nodes, graph_edges in zip(graph_len, edge_indices):
            has_edge = np.zeros(num_nodes, dtype="bool")
            has_edge[np.unique(graph_edges)] = True
            all_cons.append(np.sum(np.invert(has_edge)))
        all_cons = np.array(all_cons)

        has_unconnected = all_cons > 0
        self.info("Graph index which has unconnected '%s' with '%s' in total '%s'." % (
            np.arange(len(all_cons))[has_unconnected], all_cons[has_unconnected], len(all_cons[has_unconnected])))

        # Node degree counts how often a node occurs in the first column of the (switched) edge indices.
        node_degree = [np.zeros(x, dtype="int") for x in graph_len]
        for i, x in enumerate(edge_indices):
            nod_id, nod_counts = np.unique(x[:, 0], return_counts=True)
            node_degree[i][nod_id] = nod_counts

        # Graph-level properties are stored as a list with one entry per graph.
        g_attr = list(g_attr) if g_attr is not None else None
        g_labels = list(g_labels) if g_labels is not None else None

        # Assign to self.
        for key, value in {"node_degree": node_degree, "node_attributes": n_attr, "node_labels": n_labels,
                           "edge_attributes": e_attr, "edge_indices": edge_indices, "edge_labels": e_labels,
                           "graph_attributes": g_attr, "graph_labels": g_labels}.items():
            self.assign_property(key, value)

        return self

    @staticmethod
    def read_csv_simple(filepath: str, delimiter: str = ",", dtype: Callable = float):
        """Very simple python-only function to read in a csv-file from file.

        Each line is stripped, split at `delimiter` and every field is stripped and converted with `dtype`.
        The file is closed again even if a conversion fails.

        Args:
            filepath (str): Full filepath of csv-file to read in.
            delimiter (str): Delimiter character for separation. Default is ",".
            dtype: Callable type conversion from string. Default is float.

        Returns:
            list: Python list with one list of converted values per line. Length of the list equals the number
            of lines.
        """
        out = []
        with open(filepath, "r") as csv_file:
            for line in csv_file:
                out.append([dtype(field.strip()) for field in line.strip().split(delimiter)])
        return out