Source code for geomstats.learning.aac

"""Align All and Compute for Graph Space.

Lead author: Anna Calissano.
"""

import logging
import random

from sklearn.base import BaseEstimator

import geomstats.backend as gs
from geomstats.errors import check_parameter_accepted_values
from geomstats.learning._sklearn import PCA, LinearRegression
from geomstats.learning.frechet_mean import FrechetMean


def _warn_max_iterations(iteration, max_iter):
    if iteration + 1 == max_iter:
        logging.warning(
            f"Maximum number of iterations {max_iter} reached. "
            "The estimate may be inaccurate"
        )


class _AACFrechetMean(BaseEstimator):
    r"""Class AAC for Frechet Mean on Graph Space.

    The Align All and Compute (AAC) algorithm for Frechet Mean (FM)estimation is
    introduced in [Calissano2020]_ and it estimates the Frechet Mean for a set of
    labeled or unlabeled graphs. The idea is to optimally aligned the graphs to the
    current mean estimator using the optimal alignment between the graphs and the mean
    (graph matching between two graphs) and compute the mean estimation between the
    aligned adjacency matrices (the arithmetic mean in the euclidean space of dimension
    :math:`nodes \times nodes`). The algorithm stops as soon as the distance between two
    consecutive estimations is lower then :math:`\epsilon` or the maximum number of
    iterations is reached. The initialization step consists in aligning the data
    with respect to an initial point.

    Parameters
    ----------
    space : GraphSpace
        Graph space total space with a quotient structure.
    epsilon: float, default=1e-6
        Stopping criterion for the estimation step, i.e., the distance between two
        consecutive estimators.
    max_iter: int, default = 20
        Stopping criterion on the maximum number of iterations.
    init_point: array-like, shape=[n_nodes, n_nodes] or GraphPoint, default random.
        Algorithm initialization.
    save_last_X: bool, default = True
        Flag to save the data as aligned in the last algorithm iteration.
    total_space_estimator_kwargs : dict
        Total space estimator keyword arguments.

    Attributes
    ----------
    total_space_estimator : BaseEstimator
        Frechet mean estimator in total space.
    estimate_ : array-like, mean=[n_nodes, n_nodes]
        Mean.
    n_iter_ : int
        Number of performed iterations.
    aligned_X_: array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint.
        Set of aligned data as after the last call of fit.
        Saved if ``save_last_X is True``.

    References
    ----------
    .. [Calissano2020]  Calissano, A., Feragen, A., Vantini, S.
        “Graph Space: Geodesic Principal Components for a Population of
        Network-valued Data.” Mox report 14, 2020.
        https://mox.polimi.it/reports-and-theses/publication-results/?id=855.
    """

    def __init__(
        self,
        space,
        *,
        epsilon=1e-3,
        max_iter=20,
        init_point=None,
        total_space_estimator_kwargs=None,
        save_last_X=True,
    ):
        self.space = space
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.init_point = init_point
        self.save_last_X = save_last_X

        self.total_space_estimator_kwargs = total_space_estimator_kwargs or {}
        self.total_space_estimator = FrechetMean(
            self.space, **self.total_space_estimator_kwargs
        )

        self.estimate_ = None
        self.n_iter_ = None
        self.aligned_X_ = None

    def fit(self, X, y=None):
        """Fit the Frechet Mean.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_nodes, n_nodes].
            Dataset to estimate the FM.
        y : Ignored
            Ignored.

        Returns
        -------
        self : object
            Returns self.
        """
        previous_estimate = (
            random.choice(X) if self.init_point is None else self.init_point
        )
        aligned_X = X

        for iteration in range(self.max_iter):
            aligned_X = self.space.aligner.align(aligned_X, previous_estimate)
            new_estimate = self.total_space_estimator.fit(aligned_X).estimate_

            error = self.space.metric.dist(previous_estimate, new_estimate)
            if error < self.epsilon:
                break

            previous_estimate = new_estimate
        else:
            _warn_max_iterations(iteration, self.max_iter)

        if self.save_last_X:
            self.aligned_X_ = aligned_X
        self.estimate_ = new_estimate
        self.n_iter_ = iteration

        return self


class _AACGGPCA(BaseEstimator):
    r"""Class AAC for Generalized Geodesic Principal Components (GGPCA) on Graph Space.

    The Align All and Compute (AAC) algorithm for GGPCA estimation is
    introduced in [Calissano2020]_ and it estimates the GGPCA for a set of
    labeled or unlabeled graphs. The idea is to optimally aligned the graphs to the
    current GGPCA estimator using the optimal alignment between the graphs and the
    geodesics and then compute the GGPCA estimation between the aligned adjacency
    matrices (the PCA in the euclidean space of dimension :math:`nodes \times nodes`).
    The algorithm stops as soon as the percentage of variance explained by PCA in two
    consecutive estimations is lower then :math:`\epsilon` or the maximum number of
    iteration is reached. The initialization step consists in aligning all the data with
    respect to an initial point.

    Parameters
    ----------
    space : GraphSpace
        Graph space total space with a quotient structure.
    epsilon: float, default=1e-6
        Stopping criterion for the estimation step, i.e., the distance between two
        consecutive estimators.
    max_iter: int, default = 20
        Stopping criterion on the maximum number of iterations.
    init_point: array-like, shape=[n_nodes, n_nodes] or GraphPoint, default random.
        Algorithm initialization.
    n_components: int
        Number of principal components to be estimated. Notice that the convergence is
        ensured only for the first principal component.
    save_last_X: bool, default = True
        Flag to save the data as aligned in the last algorithm iteration.

    Attributes
    ----------
    total_space_estimator: BaseEstimator
        Method for the estimation of the PCA for a set of flattened adjacency matrices
        in the total space. Check geomstats.learning._sklearn_wrapper for details.
        Default: ``sklearn.decomposition.PCA``.
    n_iter_ : int
        Number of performed iterations.
    aligned_X_: array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint.
        Set of aligned data as after the last call of fit.
        Saved if ``save_last_X is True``.

    References
    ----------
    .. [Calissano2020]  Calissano, A., Feragen, A., Vantini, S.
        “Graph Space: Geodesic Principal Components for a Population of
        Network-valued Data.” Mox report 14, 2020.
        https://mox.polimi.it/reports-and-theses/publication-results/?id=855.
    """

    def __init__(
        self,
        space,
        *,
        n_components=2,
        epsilon=1e-3,
        max_iter=20,
        init_point=None,
        save_last_X=True,
    ):
        self.space = space
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.init_point = init_point
        self.n_components = n_components
        self.save_last_X = save_last_X

        self.total_space_estimator = PCA(n_components=self.n_components)
        self.n_iter_ = None
        self.aligned_X_ = None

    @property
    def components_(self):
        """Principal Components in the total space.

        GGPCA expressed as vectors in the total space.
        """
        return self.total_space_estimator.reshaped_components_

    @property
    def explained_variance_(self):
        """Variance Explained along the GGPCA."""
        return self.total_space_estimator.explained_variance_

    @property
    def explained_variance_ratio_(self):
        """Percentage of Variance Explained along the GGPCA."""
        return self.total_space_estimator.explained_variance_ratio_

    @property
    def mean_(self):
        """Mean at the last iteration."""
        return self.total_space_estimator.reshaped_mean_

    def fit(self, X, y=None):
        """Fit the GGPCA.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint.
            Dataset to estimate the GGPCA.
        y : Ignored
            Ignored.

        Returns
        -------
        self : object
            Returns self.

        Note: Default method in the total space is sklearn.decomposition.PCA where
        the input data are centered but not scaled for each feature.
        """
        x = random.choice(X) if self.init_point is None else self.init_point
        aligned_X = self.space.aligner.align(X, x)

        self.total_space_estimator.fit(aligned_X)
        previous_expl = self.total_space_estimator.explained_variance_ratio_[0]

        for iteration in range(self.max_iter):
            mean = self.total_space_estimator.reshaped_mean_
            direc = self.total_space_estimator.reshaped_components_[0]

            geodesic = self.space.metric.geodesic(
                initial_point=mean, initial_tangent_vec=direc
            )

            aligned_X = self.space.aligner.align_point_to_geodesic(geodesic, aligned_X)
            self.total_space_estimator.fit(aligned_X)
            expl_ = self.total_space_estimator.explained_variance_ratio_[0]

            error = gs.abs(expl_ - previous_expl)
            if error < self.epsilon:
                break
            previous_expl = expl_
        else:
            _warn_max_iterations(iteration, self.max_iter)

        if self.save_last_X:
            self.aligned_X_ = aligned_X
        self.n_iter_ = iteration

        return self


class _AACRegression(BaseEstimator):
    r"""Class AAC for Generalized Geodesic Regression (GGR) on Graph Space.

    The Align All and Compute (AAC) algorithm for GGR estimation is
    introduced in [Calissano2022] and it estimates the GGR for
    :math:`\{(s_i, X_i)\in \mathbb{R}^p\times X/T\}` a set of labeled or unlabeled
    graphs as output and a set of scalar or vector as input:
    :math:`f: \mathbb{R}^p \rightarrow X/T`. The idea is to iteratively estimate a OLS
    regression model between a set of regressors and a set of flattened adjacency
    matrices and align the graphs to the current GGR estimator using the optimal
    alignment for regression. The optimal alignment for regression consists in aligning
    the graph with the corresponding predicted graph along the regression model to
    decrease the prediction error. The algorithm stops as soon as the loss in two
    consecutive estimations is lower then math:`\epsilon` or the maximum number of
    iteration is reached. The initialization step consists in aligning all the data with
    respect to a initial point.

    Parameters
    ----------
    space : GraphSpace
        Graph space total space with a quotient structure.
    epsilon: float, default=1e-6
        Stopping criterion for the estimation step, i.e., the distance between loss
        function in two consecutive estimation steps.
    max_iter: int, default = 20
        Stopping criterion on the maximum number of iterations.
    init_point: array-like, shape=[n_nodes, n_nodes] or GraphPoint, default random.
        Algorithm initialization.
    save_last_y: bool, default=True
        Flag to save the data as aligned in the last algorithm iteration.
    total_space_estimator_kwargs : dict
        Total space estimator keyword arguments.

    Attributes
    ----------
    total_space_estimator: BaseEstimator
        Method for the estimation of the OLS Regression for a set of flattened adjacency
        matrices in the total space.
        Check geomstats.learning._sklearn_wrapper for details.
        Default: ``sklearn.linear_model.LinearRegression``.
    aligned_y_: array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint.
        Set of aligned data as after the last call of fit.
        Saved if ``self.save_last_y is True``.

    References
    ----------
    .. [Calissano2022]  Calissano, A., Feragen, A., Vantini, S.
        “Graph-valued regression: prediction of unlabelled networks in a non-Euclidean
        Graph Space.”Journal of Multivariate Analysis 190 - 104950, (2022).
        https://doi.org/10.1016/j.jmva.2022.104950.
    """

    def __init__(
        self,
        space,
        *,
        epsilon=1e-3,
        max_iter=20,
        init_point=None,
        total_space_estimator_kwargs=None,
        save_last_y=True,
    ):
        self.space = space
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.init_point = init_point
        self.save_last_y = save_last_y

        self.total_space_estimator_kwargs = total_space_estimator_kwargs or {}
        self.total_space_estimator = LinearRegression(
            **self.total_space_estimator_kwargs
        )
        self.n_iter_ = None
        self.aligned_y_ = None

    def fit(self, X, y):
        """Fit the Generalized Geodesic Regression.

        Parameters
        ----------
        X : array-like, shape=[n_samples, p].
            Dataset of regressors to estimate the GGR.
        y : array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint.
            Dataset to estimate the GGR.

        Returns
        -------
        self : object
            Returns self.
        """
        y_ = random.choice(y) if self.init_point is None else self.init_point
        aligned_y = self.space.aligner.align(y, y_)

        previous_pred_dist = 1e6
        for iteration in range(self.max_iter):
            self.total_space_estimator.fit(X, aligned_y)
            y_pred = self.total_space_estimator.predict(X)

            aligned_y = self.space.aligner.align(aligned_y, y_pred)
            pred_dist = gs.sum(self.space.metric.dist(y_pred, aligned_y))

            dist_diff = gs.abs(previous_pred_dist - pred_dist)
            if dist_diff < self.epsilon:
                break

            previous_pred_dist = pred_dist
        else:
            _warn_max_iterations(iteration, self.max_iter)

        if self.save_last_y:
            self.aligned_y_ = aligned_y
        self.n_iter_ = iteration

        return self

    def predict(self, X):
        """Predict using the generalized geodesic regression.

        Predict a graph or a set of graphs corresponding to the given regressors. It
        uses the total space prediction.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_nodes, n_nodes] or set of GraphPoint
            Dataset to estimate the GGR.

        Returns
        -------
        prediction : array-like, shape=[n_samples, n_nodes, n_nodes] or set of
            GraphPoint
            Predicted unlabeled graphs.
        """
        return self.total_space_estimator.predict(X)


[docs] class AAC: r"""Class for Align all and Compute algorithm on Graph Space. The Align All and Compute (AAC) algorithm is introduced in [Calissano2020]_ and it allows to compute different statistical estimators: the Frechet Mean, the Generalized Geodesic Principal components and the Regression for a set of labeled or unlabeled graphs. The idea is to optimally aligned the graphs to the current estimator using the correct alignment technique and compute the current estimation using the geometrical property of the total space, i.e., the Euclidean space of adjacency matrices. Parameters ---------- space : GraphSpace Graph space total space with a quotient structure. estimate : str Desired estimator. One of the following: - "frechet_mean": Frechet Mean estimation [Calissano2020]_ - "ggpca": Generalized Geodesic Principal Components [Calissano2020]_ - "regression": Graph-on-vector regression model [Calissano2022]_ Examples -------- Available example on Graph Space: :mod:`notebooks.19_practical_methods__aac` Available example on Graph Space with real world data: :mod:`notebooks.20_real_world_application__graph_space` References ---------- .. [Calissano2020] Calissano, A., Feragen, A., Vantini, S. “Graph Space: Geodesic Principal Components for a Population of Network-valued Data.” Mox report 14, 2020. https://mox.polimi.it/reports-and-theses/publication-results/?id=855. .. [Calissano2022] Calissano, A., Feragen, A., Vantini, S. “Graph-valued regression: prediction of unlabelled networks in a non-Euclidean Graph Space.”Journal of Multivariate Analysis 190 - 104950, (2022). https://doi.org/10.1016/j.jmva.2022.104950. """ MAP_ESTIMATE = { "frechet_mean": _AACFrechetMean, "ggpca": _AACGGPCA, "regression": _AACRegression, } def __new__(cls, space, *, estimate="frechet_mean", **kwargs): """Class for Align all and Compute algorithm on Graph Space.""" check_parameter_accepted_values( estimate, "estimate", list(cls.MAP_ESTIMATE.keys()) ) return cls.MAP_ESTIMATE[estimate](space, **kwargs)