import os.path
import numpy as np
import logging
from typing import List, Dict
from sklearn.preprocessing import StandardScaler as StandardScalerSklearn
from kgcnn.data.utils import save_json_file, load_json_file
# Module-level logger used below to warn about unknown weights when deserializing.
logging.basicConfig() # Module logger
module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)
class _StandardScalerSklearnMixin:
    r"""Mixin class for scaler of :obj:`sklearn` with added functionality to save and load weights of a scaler
    similar to keras layers and objects.

    .. note::

        This class is only meant to add functionality. Scaler is accessed via :obj:`_scaler_reference` property.
    """

    # Fitted attributes of the wrapped sklearn scaler that are (de)serialized as weights.
    _attributes_list_sklearn = ["n_features_in_", "mean_", "scale_", "var_", "feature_names_in_", "n_samples_seen_"]

    def __init__(self):
        # Subclasses must assign the actual sklearn scaler instance here.
        self._scaler_reference = None
        # Dict key of each dataset item that serves as input in `fit_dataset`.
        self._x_name = None
        # Dict key of each dataset item with per-sample weights; None disables weighting.
        self._sample_weight_name = None

    def get_scaling(self):
        """Get scale of shape (1, n_properties), or None if the scaler has not been fitted."""
        if not hasattr(self._scaler_reference, "scale_"):
            return
        scale = np.array(self._scaler_reference.scale_)
        scale = np.expand_dims(scale, axis=0)
        return scale

    def get_mean_shift(self):
        """Get mean of shape (1, n_properties), or None if the scaler has not been fitted."""
        if not hasattr(self._scaler_reference, "mean_"):
            return
        mean = np.array(self._scaler_reference.mean_)
        mean = np.expand_dims(mean, axis=0)
        return mean

    def get_config(self) -> dict:
        """Get configuration for scaler."""
        config = self._scaler_reference.get_params()
        return config

    def set_config(self, config: dict):
        """Set configuration for scaler.

        Keys unknown to the wrapped sklearn scaler are ignored. Subclasses add extra
        keys like 'X' or 'y' in :obj:`get_config`, and sklearn's `set_params` raises
        a ValueError for unknown parameters, which would make :obj:`load` fail on its
        own serialization otherwise.

        Args:
            config (dict): Config dictionary.
        """
        valid_params = self._scaler_reference.get_params()
        self._scaler_reference.set_params(
            **{key: value for key, value in config.items() if key in valid_params})

    def get_weights(self) -> dict:
        """Get weights for this scaler after fit."""
        weight_dict = dict()
        for x in self._attributes_list_sklearn:
            if hasattr(self._scaler_reference, x):
                value = getattr(self._scaler_reference, x)
                # Cast to plain lists for JSON serialization; keep None as-is.
                value_update = {x: np.array(value).tolist()} if value is not None else {x: value}
                weight_dict.update(value_update)
        return weight_dict

    def set_weights(self, weights: dict):
        """Set weights for this scaler.

        Args:
            weights (dict): Weight dictionary.
        """
        for item, value in weights.items():
            if item in self._attributes_list_sklearn:
                setattr(self._scaler_reference, item, np.array(value))
            else:
                module_logger.warning("`StandardScaler` got unknown weight '%s'." % item)

    def save_weights(self, file_path: str):
        """Save weights as numpy to file.

        Args:
            file_path: Filepath to save weights.
        """
        weights = self.get_weights()
        # Make them all numpy arrays for save.
        for key, value in weights.items():
            weights[key] = np.array(value)
        if len(weights) > 0:
            np.savez(os.path.splitext(file_path)[0] + ".npz", **weights)
        else:
            module_logger.warning("Error, no weights to save for `StandardScaler`.")

    def save(self, file_path: str):
        """Save scaler serialization to file.

        Args:
            file_path: Filepath to save scaler serialization.
        """
        conf = self.get_config()
        weights = self.get_weights()
        full_info = {"class_name": type(self).__name__, "module_name": type(self).__module__,
                     "config": conf, "weights": weights}
        save_json_file(full_info, os.path.splitext(file_path)[0] + ".json")

    def load(self, file_path: str):
        """Load scaler serialization from file.

        Args:
            file_path: Filepath to load scaler serialization.
        """
        # NOTE(review): unlike `save`, no '.json' extension is enforced here; pass
        # the full file path including extension.
        full_info = load_json_file(file_path)
        # Could verify class_name and module_name here.
        self.set_config(full_info["config"])
        self.set_weights(full_info["weights"])
        return self

    # Similar functions that work on dataset plus property names.

    # noinspection PyPep8Naming
    def fit_dataset(self, dataset: List[Dict[str, np.ndarray]]):
        r"""Fit to dataset with relevant `X` , `y` information.

        Args:
            dataset (list): Dataset of type `List[Dict]` with dictionary of numpy arrays.

        Returns:
            self.
        """
        return self._scaler_reference.fit(
            [item[self._x_name] for item in dataset],
            # We can ignore y here. None is default for sklearn StandardScaler.
            # y=None
            sample_weight=[
                item[self._sample_weight_name] for item in dataset] if self._sample_weight_name is not None else None
        )

    @property
    def scale_(self):
        # Expose the fitted scale of the wrapped sklearn scaler directly.
        return self._scaler_reference.scale_
class StandardScaler(_StandardScalerSklearnMixin):
    r"""Standard scaler that uses obj:`sklearn.preprocessing.StandardScaler` .

    Included unused kwarg 'atomic_number' to be compatible with some material oriented scaler.

    .. code-block:: python

        import numpy as np
        from kgcnn.data.transform.scaler.standard import StandardScaler
        data = np.random.rand(5).reshape((5,1))
        scaler = StandardScaler()
        scaler.fit(X=data)
        print(scaler.get_weights())
        print(scaler.get_config())
        print(scaler.inverse_transform(scaler.transform(X=data)))
        print(data)
        scaler.save("example.json")
        new_scaler = StandardScaler()
        new_scaler.load("example.json")
        print(new_scaler.inverse_transform(scaler.transform(X=data)))
    """

    # noinspection PyPep8Naming
    def __init__(self, *, X: str = "graph_attributes", sample_weight: str = None,
                 copy=True, with_mean=True, with_std=True):
        super().__init__()
        # The wrapped sklearn scaler that does the numerical work.
        self._scaler_reference = StandardScalerSklearn(copy=copy, with_mean=with_mean, with_std=with_std)
        self._x_name = X
        self._sample_weight_name = sample_weight

    # noinspection PyPep8Naming
    def fit(self, X, *, y=None, sample_weight=None, atomic_number=None):
        """Compute the mean and std to be used for later scaling.

        Args:
            X (np.ndarray): Array of shape (n_samples, n_features). The data used to
                compute mean and standard deviation for later scaling along the feature's axis.
            y (None): Ignored.
            sample_weight (np.ndarray): Individual weights for each sample.
            atomic_number (list, None): Ignored.

        Returns:
            self: Fitted scaler.
        """
        return self._scaler_reference.fit(X=X, y=y, sample_weight=sample_weight)

    # noinspection PyPep8Naming
    def partial_fit(self, X, y=None, sample_weight=None, atomic_number=None):
        r"""Online computation of mean and std on X for later scaling.

        All of X is processed as a single batch, intended for cases where :meth:`fit`
        is not feasible due to a very large `n_samples` or because X is read from a
        continuous stream. The incremental mean/std algorithm is Equation 1.5a,b in
        `Chan, et al. (1982) <https://www.tandfonline.com/doi/abs/10.1080/00031305.1983.10483115>`__ .

        Args:
            X (np.ndarray): Array of shape (n_samples, n_features). The data used to
                compute mean and standard deviation for later scaling along the feature's axis.
            y (np.ndarray, None): Ignored.
            sample_weight (np.ndarray): Array-like of shape (n_samples,), default=None.
                Individual weights for each sample.
            atomic_number (list): Not used.

        Returns:
            self: Fitted scaler.
        """
        # sklearn's partial_fit takes positional args internally; keyword form is fine here.
        return self._scaler_reference.partial_fit(X=X, y=y, sample_weight=sample_weight)

    def get_config(self) -> dict:
        """Return sklearn parameters plus the dataset property name used as `X`."""
        config = dict(super().get_config())
        config["X"] = self._x_name
        return config
class StandardLabelScaler(_StandardScalerSklearnMixin):
    r"""Standard scaler for labels that has a member of :obj:`sklearn.preprocessing.StandardScaler` .

    Included unused kwarg 'atomic_number' to be compatible with some material oriented scaler.
    Uses `y` argument for scaling labels and `X` is ignored.

    .. code-block:: python

        import numpy as np
        from kgcnn.data.transform.scaler.standard import StandardLabelScaler
        data = np.random.rand(5).reshape((5,1))
        scaler = StandardLabelScaler()
        scaler.fit(y=data)
        print(scaler.fit_transform(y=data))
        print(scaler.get_weights())
        print(scaler.get_config())
        print(scaler.inverse_transform(y=scaler.transform(y=data)))
        print(data)
        scaler.save("example.json")
        new_scaler = StandardLabelScaler()
        new_scaler.load("example.json")
        print(new_scaler.inverse_transform(y=scaler.transform(y=data)))
    """

    def __init__(self, *, y: str = "graph_labels", sample_weight: str = None, copy=True, with_mean=True,
                 with_std=True):
        super().__init__()
        self._scaler_reference = StandardScalerSklearn(copy=copy, with_mean=with_mean, with_std=with_std)
        # Labels occupy the mixin's `X` slot, since sklearn scales whatever is passed as X.
        self._x_name = y
        self._sample_weight_name = sample_weight

    def _validate_input(self, y, x):
        # Guard: this scaler operates on labels `y`; `X` alone is a usage error.
        if x is not None and y is None:
            raise ValueError(
                "Got X but y is 'None' for `%s`. Report this issue if sklearn API change. " % type(self).__name__)
        if y is None and x is None:
            raise ValueError(
                "Require labels as `y` for `%s`. Input must be e.g. 'fit(data)'." % type(self).__name__)

    # noinspection PyPep8Naming
    def fit(self, y: np.ndarray, *, X=None, sample_weight=None, atomic_number=None):
        r"""Compute the mean and std to be used for later scaling.

        Args:
            y (np.ndarray): Array of shape (n_samples, n_labels). The data used to
                compute mean and standard deviation for later scaling along the feature's axis.
            X (None): Ignored.
            sample_weight (np.ndarray): Individual weights for each sample.
            atomic_number (list): Ignored.

        Returns:
            self: Fitted scaler.
        """
        self._validate_input(y, X)
        # sklearn's fit() resets and partial-fits; labels are simply passed in place of X.
        return self._scaler_reference.fit(X=y, sample_weight=sample_weight)

    # noinspection PyPep8Naming
    def partial_fit(self, y: np.ndarray, X=None, sample_weight=None, atomic_number=None):
        r"""Online computation of mean and std on y for later scaling.

        All of y is processed as a single batch, intended for cases where :meth:`fit`
        is not feasible due to a very large `n_samples` or because y is read from a
        continuous stream. The incremental mean/std algorithm is Equation 1.5a,b in
        `Chan, et al. (1982) <https://www.tandfonline.com/doi/abs/10.1080/00031305.1983.10483115>`__ .

        Args:
            y (np.ndarray): Array of shape (n_samples, n_labels). The data used to
                compute mean and standard deviation for later scaling along the feature's axis.
            X (None): Ignored.
            sample_weight (np.ndarray): Individual weights for each sample.
            atomic_number (list): Ignored.

        Returns:
            self: Fitted scaler.
        """
        self._validate_input(y, X)
        # Labels take the X slot of sklearn's partial_fit.
        return self._scaler_reference.partial_fit(X=y, sample_weight=sample_weight)

    def get_config(self) -> dict:
        """Return sklearn parameters plus the dataset property name used as `y`."""
        config = dict(super().get_config())
        config["y"] = self._x_name
        return config