Source code for profit.sur.gp.backend.gp_functions

"""
Collection of functions for the Custom GPSurrogate.
"""

import numpy as np


[docs]def optimize(
    xtrain,
    ytrain,
    a0,
    kernel,
    fixed_sigma_n=False,
    eval_gradient=False,
    return_hess_inv=False,
):
    r"""Finds optimal hyperparameters from initial array a0, sorted as [length_scale, scale, noise].

    The objective function, which is minimized, is the negative log likelihood.
    The hyperparameters are transformed logarithmically before the optimization to enhance its flexibility. They are
    transformed back afterwards.

    Parameters:
        xtrain (ndarray):
        ytrain (ndarray):
        a0 (ndarray): Flat array of initial hyperparameters.
        kernel (function): Function of the kernel, not the actual matrix.
        fixed_sigma_n (bool): If the noise $\sigma_n$ should be fixed during optimization.
        eval_gradient (bool): Whether the analytic derivatives of the kernel and likelihood are used in the
            optimization.
        return_hess_inv (bool): If True, returns the inverse Hessian matrix from the optimization result.
            This is important for advanced active learning.

    Returns:
        tuple: A tuple containing:
            - opt_hyperparameters (ndarray): Flat array of optimized hyperparameters.
            - hess_inv (scipy.optimize.LbfgsInvHessProduct): Inverse Hessian matrix in the form of a scipy linear
            operator.
            If return_hess_inv is False, only the optimized hyperparameters are returned.
    """

    from scipy.optimize import minimize

    if fixed_sigma_n:
        sigma_n = a0[-1]
        a0 = a0[:-1]
    else:
        sigma_n = None

    a0 = np.log10(a0)

    # Additional arguments for the negative_log_likelihood function.
    args = [xtrain, ytrain, kernel, eval_gradient, True]

    # If sigma_n should be kept fixed during optimization.
    if sigma_n is not None:
        args.append(sigma_n)

    opt_result = minimize(
        negative_log_likelihood,
        a0,
        method="bfgs",
        args=tuple(args),
        bounds=None,
        jac=eval_gradient,
    )
    if return_hess_inv:
        return 10**opt_result.x, opt_result.hess_inv
    return 10**opt_result.x


[docs]def solve_cholesky(L, b):
    r"""Solves a linear equation with a lower triangular matrix L from the Cholesky decomposition.

    In the context of GP's the $L$ is the Cholesky decomposition of the training kernel matrix and $b$ is the training
    output data.

    $$
    \begin{equation}
    \alpha = L^T L b
    \end{equation}
    $$

    Parameters:
        L (ndarray): Lower triangular matrix.
        b (ndarray): A vector.

    Returns:
        ndarray
    """

    from scipy.linalg import solve_triangular

    alpha = solve_triangular(
        L.T,
        solve_triangular(L, b, lower=True, check_finite=False),
        lower=False,
        check_finite=False,
    )
    return alpha


[docs]def negative_log_likelihood_cholesky(
    hyp, X, y, kernel, eval_gradient=False, log_scale_hyp=False, fixed_sigma_n=False
):
    r"""Computes the negative log likelihood using the Cholesky decomposition of the covariance matrix.

    The calculation follows Rasmussen&Williams 2006, p. 19, 113-114.

    $$
    \begin{align}
    NL &= \frac{1}{2} y^T \alpha + tr(\log(L)) + \frac{n}{2} \log(2 \pi) \\
    \frac{dNL}{d\theta} &= \frac{1}{2} tr\left( (K_y^{-1} - \alpha \alpha^T) \frac{\partial K}{\partial \theta} \right) \\
    \alpha &= K_y^{-1} y
    \end{align}
    $$

    Parameters:
        hyp (ndarray): Flat hyperparameter array [length_scale, sigma_f, sigma_n]. They can also be already
            log transformed.
        X (ndarray): Training input points.
        y (ndarray): Observed training output.
        kernel (function): Function to build the covariance matrix.
        eval_gradient (bool): If the analytic gradient of the negative log likelihood w.r.t. the hyperparameters should
            be returned.
        log_scale_hyp (bool): Whether the hyperparameters are log transformed. This is important for the gradient
            calculation.
        fixed_sigma_n (bool): If the noise $\sigma_n$ is kept fixed. In this case, there is no gradient with respect to
            sigma_n.

    Returns:
        tuple: A tuple containing:
            - nll (float): The negative log likelihood.
            - dnll (ndarray): The derivative of the negative log likelihood w.r.t. to the hyperparameters.
            If eval_gradient is False, only nll is returned.
    """

    Ky = kernel(X, X, hyp[:-2], *hyp[-2:], eval_gradient=eval_gradient)
    if eval_gradient:
        dKy = Ky[1]
        Ky = Ky[0]

    L = np.linalg.cholesky(Ky)
    alpha = solve_cholesky(L, y)
    nll = (
        0.5 * y.T @ alpha
        + np.sum(np.log(L.diagonal()))
        + len(X) * 0.5 * np.log(2.0 * np.pi)
    )
    if not eval_gradient:
        return nll.item()
    KyinvaaT = invert_cholesky(L)
    KyinvaaT -= np.outer(alpha, alpha)
    dnll = 0.5 * np.trace(KyinvaaT @ dKy)  # Rasmussen&Williams p. 114, eq. 5.9
    if fixed_sigma_n:
        dnll = dnll[:-1]
        hyp = hyp[:-1]
    if log_scale_hyp:
        # Chain rule
        dnll *= hyp * np.log(10)
    return nll.item(), dnll


[docs]def negative_log_likelihood(
    hyp,
    X,
    y,
    kernel,
    eval_gradient=False,
    log_scale_hyp=False,
    fixed_sigma_n_value=None,
    neig=0,
    max_iter=1000,
):
    r"""Computes the negative log likelihood either by a Cholesky- or an Eigendecomposition.

    First, the Cholesky decomposition is tried. If this results in a LinAlgError, the biggest eigenvalues are
    calculated until convergence or until the maximum iterations are reached.
    The eigenvalues are cut off at $1e-10$ due ensure numerical stability.

    $$
    \begin{align}
    NL &= \frac{1}{2} \left( y^T \alpha + tr(\log(\lambda)) + n_{eig} \log(2 \pi) \right) \\
    \frac{dNL}{d\theta} &= \frac{1}{2} tr\left( (K_y^{-1} - \alpha \alpha^T) \frac{\partial K}{\partial \theta} \right) \\
    \alpha &= v (\lambda^{-1} (v^T y))
    \end{align}
    $$

    Parameters:
        hyp (ndarray): Flat hyperparameter array [length_scale, sigma_f, sigma_n].
        X (ndarray): Training input points.
        y (ndarray): Observed training output.
        kernel (function): Function to build the covariance matrix.
        eval_gradient (bool): If the analytic gradient of the negative log likelihood w.r.t. the hyperparameters should
            be returned.
        log_scale_hyp (bool): Whether the hyperparameters are log transformed. This is important for the gradient
            calculation.
        fixed_sigma_n_value (float): The value of the fixed noise $\sigma_n$. If it should be optimizied as well, this
            should be None.
        neig (int): Initial number of eigenvalues to calculate if the Cholesky decomposition is not successful.
            This is doubled during every iteration.
        max_iter (int): Maximum number of iterations of the eigenvalue solver until convergence must be reached.

    Returns:
        tuple: A tuple containing:
            - nll (float): The negative log likelihood.
            - dnll (ndarray): The derivative of the negative log likelihood w.r.t. to the hyperparameters.
            If eval_gradient is False, only nll is returned.
    """

    from scipy.sparse.linalg import eigsh

    if log_scale_hyp:
        hyp = 10**hyp
    fixed_sigma_n = fixed_sigma_n_value is not None
    if fixed_sigma_n:
        hyp = np.append(hyp, fixed_sigma_n_value)

    clip_eig = max(1e-3 * min(abs(hyp[:-2])), 1e-10)
    Ky = kernel(
        X, X, hyp[:-2], *hyp[-2:], eval_gradient=eval_gradient
    )  # Construct covariance matrix

    if eval_gradient:
        dKy = Ky[1]
        Ky = Ky[0]

    converged = False
    iteration = 0
    neig = max(neig, 1)
    while not converged:
        if not neig or neig > 0.05 * len(
            X
        ):  # First try with Cholesky decomposition if neig is big
            try:
                return negative_log_likelihood_cholesky(
                    hyp,
                    X,
                    y,
                    kernel,
                    eval_gradient=eval_gradient,
                    log_scale_hyp=log_scale_hyp,
                    fixed_sigma_n=fixed_sigma_n,
                )
            except np.linalg.LinAlgError:
                print("Warning! Fallback to eig solver!")
        w, Q = eigsh(
            Ky, neig, tol=clip_eig
        )  # Otherwise, calculate the first neig eigenvalues and eigenvectors
        if iteration >= max_iter:
            print("Reached max. iterations!")
            break
        neig *= 2  # Calculate more eigenvalues
        converged = w[0] <= clip_eig or neig >= len(X)

    # Calculate the NLL with these eigenvalues and eigenvectors
    w = np.maximum(w, 1e-10)
    alpha = Q @ (np.diag(1.0 / w) @ (Q.T @ y))
    nll = 0.5 * (
        y.T @ alpha + np.sum(np.log(w)) + min(neig, len(X)) * np.log(2 * np.pi)
    )
    if not eval_gradient:
        return nll.item()

    KyinvaaT = invert(Ky)
    KyinvaaT -= np.outer(alpha, alpha)

    dnll = 0.5 * np.trace(KyinvaaT @ dKy)  # Rasmussen&Williams p. 114, eq. 5.9
    if fixed_sigma_n:
        dnll = dnll[:-1]
        hyp = hyp[:-1]
    if log_scale_hyp:
        # Chain rule
        dnll *= hyp * np.log(10)
    return nll.item(), dnll


[docs]def invert_cholesky(L):
    r"""Inverts a positive-definite matrix based on a Cholesky decomposition.

    This is used to invert the covariance matrix.

    Parameters:
        L (ndarray): Lower triangular matrix from a Cholesky decomposition.

    Returns:
        ndarray: Inverse of the matrix $L^T L$
    """
    from scipy.linalg import solve_triangular

    return solve_triangular(
        L.T,
        solve_triangular(L, np.eye(L.shape[0]), lower=True, check_finite=False),
        lower=False,
        check_finite=False,
    )


[docs]def invert(K, neig=0, tol=1e-10, eps=1e-6, max_iter=1000):
    """Inverts a positive-definite matrix using either a Cholesky- or an Eigendecomposition.

    The solution method depends on the rapidness of decay of the eigenvalues.

    Parameters:
        K (np.ndarray): Kernel matrix.
        neig (int): Initial number of eigenvalues to calculate if the Cholesky decomposition is not successful.
            This is doubled during every iteration.
        tol (float): Convergence criterion for the eigenvalues.
        eps (float): Small number to be added to diagonal of kernel matrix to ensure positive definiteness.
        max_iter (int): Maximum number of iterations of the eigenvalue solver until convergence must be reached.

    Returns:
        np.ndarray: Inverse covariance matrix.
    """
    from scipy.sparse.linalg import eigsh

    try:
        L = np.linalg.cholesky(K)  # Cholesky decomposition of the covariance matrix
    except np.linalg.LinAlgError:
        L = np.linalg.cholesky(K + eps * np.eye(K.shape[0], K.shape[1]))
    if neig <= 0 or neig > 0.05 * len(K):  # First try with Cholesky decomposition
        try:
            return invert_cholesky(L)
        except np.linalg.LinAlgError:
            print("Warning! Fallback to eig solver!")

    # Otherwise, calculate the first neig eigenvalues and eigenvectors
    w, Q = eigsh(K, neig, tol=tol)
    for iteration in range(max_iter):  # Iterate until convergence or max_iter
        if neig > 0.05 * len(K):
            try:
                return invert_cholesky(L)
            except np.linalg.LinAlgError:
                print("Warning! Fallback to eig solver!")
        neig = 2 * neig  # Calculate more eigenvalues
        w, Q = eigsh(K, neig, tol=tol)

        # Convergence criterion
        if np.abs(w[0] - tol) <= tol or neig >= len(K):
            break
    if iteration == max_iter:
        print("Tried maximum number of times.")

    negative_eig = (-1e-2 < w) & (w < 0)
    w[negative_eig] = 1e-2
    K_inv = Q @ (np.diag(1 / w) @ Q.T)
    return K_inv


[docs]def marginal_variance_BBQ(
    Xtrain,
    ytrain,
    Xpred,
    kernel,
    hyperparameters,
    hess_inv,
    fixed_sigma_n=False,
    alpha=None,
    predictive_variance=0,
):
    r"""Calculates the marginal variance to infer the next point in active learning.

    The calculation follows Osborne (2012).

        $\tilde{V}$ ... Marginal covariance matrix
        $\hat{V}$ ... Predictive variance
        $\frac{dm}{d\theta}$ ... Derivative of the predictive mean w.r.t. the hyperparameters
        $H$ ... Hessian matrix

        $$
        \begin{equation}
        \tilde{V} = \left( \frac{dm}{d\theta} \right) H^{-1} \left( \frac{dm}{d\theta} \right)^T
        \end{equation}
        $$

    Parameters:
        Xtrain (ndarray): Input training points.
        ytrain (ndarray): Observed output daat.
        Xpred (ndarray): Possible prediction input points.
        kernel (function): Function to build the covariance matrix.
        hyperparameters (dict): Dictionary of the hyperparameters.
        hess_inv (ndarray): Inverse Hessian matrix.
        fixed_sigma_n (bool): If the noise $\sigma_n$ was fixed during optimization. Then, the Hessian has to be
            padded with zeros.
        alpha (ndarray): If available, the result of $K_y^{-1} y$, else None.
        predictive_variance (ndarray): Predictive variance only. This is added to the marginal variance.

    Returns:
        ndarray: Sum of the actual marginal variance and the predictive variance.
    """

    # TODO: Add full derivatives as in Osborne (2021) and Garnett (2014)
    ordered_hyperparameters = [
        hyperparameters[key] for key in ("length_scale", "sigma_f", "sigma_n")
    ]

    # If no Hessian is available, use only the predictive variance.
    if hess_inv is None:
        return predictive_variance.reshape(-1, 1)

    if fixed_sigma_n:
        padding = np.zeros((len(ordered_hyperparameters), 1))
        hess_inv = np.hstack([np.vstack([hess_inv, padding[:-1].T]), padding])

    # Kernels and their derivatives
    Ky, dKy = kernel(Xtrain, Xtrain, *ordered_hyperparameters, eval_gradient=True)
    Kstar, dKstar = kernel(
        Xpred, Xtrain, *ordered_hyperparameters[:-1], eval_gradient=True
    )
    Kyinv = invert(Ky)
    if alpha is None:
        alpha = Kyinv @ ytrain
    dalpha_dl = -Kyinv @ (dKy[..., 0] @ alpha)
    dalpha_dsigma_f = -Kyinv @ (dKy[..., 1] @ alpha)
    dalpha_dsigma_n = -Kyinv @ (
        2 * hyperparameters["sigma_n"] * np.eye(Xtrain.shape[0]) @ alpha
    )

    dm = np.empty((Xpred.shape[0], len(ordered_hyperparameters), 1))
    dm[:, 0, :] = dKstar[..., 0] @ alpha + Kstar @ dalpha_dl
    dm[:, 1, :] = dKstar[..., 1] @ alpha + Kstar @ dalpha_dsigma_f
    dm[:, 2, :] = Kstar @ dalpha_dsigma_n
    dm = dm.squeeze()

    marginal_variance = dm @ (hess_inv @ dm.T)
    marginal_variance = np.diag(marginal_variance).reshape(-1, 1)

    return marginal_variance + predictive_variance


[docs]def predict_f(hyp, x, y, xtest, kernel, return_full_cov=False, neig=0):
    """Predicts values given only a set of hyperparameters and a kernel.

    This function is independent of the surrogates and used to quickly predict a function output with specific
    hyperparameters.

    The calculation follows Rasmussen&Williams, p. 16, eq. 23-24.

    Parameters:
        hyp (ndarray): Flat array of hyperparameters, sorted as [length_scale, sigma_f, sigma_n].
        x (ndarray): Input training points.
        y (ndarray): Observed output data.
        xtest (ndarray): Prediction points.
        kernel (function): Function to build the covariance matrix.
        return_full_cov (bool): If True, returns the full covariance matrix, otherwise only its diagonal.
        neig (int): Initial number of eigenvalues to be computed during the inversion of the covariance matrix.

    Returns:
        tuple: A tuple containing:
            - fstar (ndarray): Posterior mean.
            - vstar (ndarray): Posterior covariance matrix or its diagonal.
    """

    if len(hyp) < 3:
        hyp[2] = 0  # sigma_n
    Ky = kernel(x, x, hyp[:-2], *hyp[-2:])
    Kstar = kernel(x, xtest, hyp[:-2], hyp[-2])
    Kstarstar = kernel(xtest, xtest, *hyp)
    Kyinv = invert(Ky, neig, 1e-6 * hyp[-1])
    fstar = Kstar.T @ (Kyinv @ y)
    vstar = Kstarstar - Kstar.T @ (invert(Ky) @ Kstar)
    vstar = np.maximum(vstar, 1e-10)  # Assure a positive variance
    if not return_full_cov:
        vstar = np.diag(vstar)
    return fstar, vstar