"""
Collection of functions for the Custom GPSurrogate.
"""
import numpy as np
[docs]def optimize(
xtrain,
ytrain,
a0,
kernel,
fixed_sigma_n=False,
eval_gradient=False,
return_hess_inv=False,
):
r"""Finds optimal hyperparameters from initial array a0, sorted as [length_scale, scale, noise].
The objective function, which is minimized, is the negative log likelihood.
The hyperparameters are transformed logarithmically before the optimization to enhance its flexibility. They are
transformed back afterwards.
Parameters:
xtrain (ndarray):
ytrain (ndarray):
a0 (ndarray): Flat array of initial hyperparameters.
kernel (function): Function of the kernel, not the actual matrix.
fixed_sigma_n (bool): If the noise $\sigma_n$ should be fixed during optimization.
eval_gradient (bool): Whether the analytic derivatives of the kernel and likelihood are used in the
optimization.
return_hess_inv (bool): If True, returns the inverse Hessian matrix from the optimization result.
This is important for advanced active learning.
Returns:
tuple: A tuple containing:
- opt_hyperparameters (ndarray): Flat array of optimized hyperparameters.
- hess_inv (scipy.optimize.LbfgsInvHessProduct): Inverse Hessian matrix in the form of a scipy linear
operator.
If return_hess_inv is False, only the optimized hyperparameters are returned.
"""
from scipy.optimize import minimize
if fixed_sigma_n:
sigma_n = a0[-1]
a0 = a0[:-1]
else:
sigma_n = None
a0 = np.log10(a0)
# Additional arguments for the negative_log_likelihood function.
args = [xtrain, ytrain, kernel, eval_gradient, True]
# If sigma_n should be kept fixed during optimization.
if sigma_n is not None:
args.append(sigma_n)
opt_result = minimize(
negative_log_likelihood,
a0,
method="bfgs",
args=tuple(args),
bounds=None,
jac=eval_gradient,
)
if return_hess_inv:
return 10**opt_result.x, opt_result.hess_inv
return 10**opt_result.x
[docs]def solve_cholesky(L, b):
r"""Solves a linear equation with a lower triangular matrix L from the Cholesky decomposition.
In the context of GP's the $L$ is the Cholesky decomposition of the training kernel matrix and $b$ is the training
output data.
$$
\begin{equation}
\alpha = L^T L b
\end{equation}
$$
Parameters:
L (ndarray): Lower triangular matrix.
b (ndarray): A vector.
Returns:
ndarray
"""
from scipy.linalg import solve_triangular
alpha = solve_triangular(
L.T,
solve_triangular(L, b, lower=True, check_finite=False),
lower=False,
check_finite=False,
)
return alpha
[docs]def negative_log_likelihood_cholesky(
hyp, X, y, kernel, eval_gradient=False, log_scale_hyp=False, fixed_sigma_n=False
):
r"""Computes the negative log likelihood using the Cholesky decomposition of the covariance matrix.
The calculation follows Rasmussen&Williams 2006, p. 19, 113-114.
$$
\begin{align}
NL &= \frac{1}{2} y^T \alpha + tr(\log(L)) + \frac{n}{2} \log(2 \pi) \\
\frac{dNL}{d\theta} &= \frac{1}{2} tr\left( (K_y^{-1} - \alpha \alpha^T) \frac{\partial K}{\partial \theta} \right) \\
\alpha &= K_y^{-1} y
\end{align}
$$
Parameters:
hyp (ndarray): Flat hyperparameter array [length_scale, sigma_f, sigma_n]. They can also be already
log transformed.
X (ndarray): Training input points.
y (ndarray): Observed training output.
kernel (function): Function to build the covariance matrix.
eval_gradient (bool): If the analytic gradient of the negative log likelihood w.r.t. the hyperparameters should
be returned.
log_scale_hyp (bool): Whether the hyperparameters are log transformed. This is important for the gradient
calculation.
fixed_sigma_n (bool): If the noise $\sigma_n$ is kept fixed. In this case, there is no gradient with respect to
sigma_n.
Returns:
tuple: A tuple containing:
- nll (float): The negative log likelihood.
- dnll (ndarray): The derivative of the negative log likelihood w.r.t. to the hyperparameters.
If eval_gradient is False, only nll is returned.
"""
Ky = kernel(X, X, hyp[:-2], *hyp[-2:], eval_gradient=eval_gradient)
if eval_gradient:
dKy = Ky[1]
Ky = Ky[0]
L = np.linalg.cholesky(Ky)
alpha = solve_cholesky(L, y)
nll = (
0.5 * y.T @ alpha
+ np.sum(np.log(L.diagonal()))
+ len(X) * 0.5 * np.log(2.0 * np.pi)
)
if not eval_gradient:
return nll.item()
KyinvaaT = invert_cholesky(L)
KyinvaaT -= np.outer(alpha, alpha)
dnll = 0.5 * np.trace(KyinvaaT @ dKy) # Rasmussen&Williams p. 114, eq. 5.9
if fixed_sigma_n:
dnll = dnll[:-1]
hyp = hyp[:-1]
if log_scale_hyp:
# Chain rule
dnll *= hyp * np.log(10)
return nll.item(), dnll
[docs]def negative_log_likelihood(
hyp,
X,
y,
kernel,
eval_gradient=False,
log_scale_hyp=False,
fixed_sigma_n_value=None,
neig=0,
max_iter=1000,
):
r"""Computes the negative log likelihood either by a Cholesky- or an Eigendecomposition.
First, the Cholesky decomposition is tried. If this results in a LinAlgError, the biggest eigenvalues are
calculated until convergence or until the maximum iterations are reached.
The eigenvalues are cut off at $1e-10$ due ensure numerical stability.
$$
\begin{align}
NL &= \frac{1}{2} \left( y^T \alpha + tr(\log(\lambda)) + n_{eig} \log(2 \pi) \right) \\
\frac{dNL}{d\theta} &= \frac{1}{2} tr\left( (K_y^{-1} - \alpha \alpha^T) \frac{\partial K}{\partial \theta} \right) \\
\alpha &= v (\lambda^{-1} (v^T y))
\end{align}
$$
Parameters:
hyp (ndarray): Flat hyperparameter array [length_scale, sigma_f, sigma_n].
X (ndarray): Training input points.
y (ndarray): Observed training output.
kernel (function): Function to build the covariance matrix.
eval_gradient (bool): If the analytic gradient of the negative log likelihood w.r.t. the hyperparameters should
be returned.
log_scale_hyp (bool): Whether the hyperparameters are log transformed. This is important for the gradient
calculation.
fixed_sigma_n_value (float): The value of the fixed noise $\sigma_n$. If it should be optimizied as well, this
should be None.
neig (int): Initial number of eigenvalues to calculate if the Cholesky decomposition is not successful.
This is doubled during every iteration.
max_iter (int): Maximum number of iterations of the eigenvalue solver until convergence must be reached.
Returns:
tuple: A tuple containing:
- nll (float): The negative log likelihood.
- dnll (ndarray): The derivative of the negative log likelihood w.r.t. to the hyperparameters.
If eval_gradient is False, only nll is returned.
"""
from scipy.sparse.linalg import eigsh
if log_scale_hyp:
hyp = 10**hyp
fixed_sigma_n = fixed_sigma_n_value is not None
if fixed_sigma_n:
hyp = np.append(hyp, fixed_sigma_n_value)
clip_eig = max(1e-3 * min(abs(hyp[:-2])), 1e-10)
Ky = kernel(
X, X, hyp[:-2], *hyp[-2:], eval_gradient=eval_gradient
) # Construct covariance matrix
if eval_gradient:
dKy = Ky[1]
Ky = Ky[0]
converged = False
iteration = 0
neig = max(neig, 1)
while not converged:
if not neig or neig > 0.05 * len(
X
): # First try with Cholesky decomposition if neig is big
try:
return negative_log_likelihood_cholesky(
hyp,
X,
y,
kernel,
eval_gradient=eval_gradient,
log_scale_hyp=log_scale_hyp,
fixed_sigma_n=fixed_sigma_n,
)
except np.linalg.LinAlgError:
print("Warning! Fallback to eig solver!")
w, Q = eigsh(
Ky, neig, tol=clip_eig
) # Otherwise, calculate the first neig eigenvalues and eigenvectors
if iteration >= max_iter:
print("Reached max. iterations!")
break
neig *= 2 # Calculate more eigenvalues
converged = w[0] <= clip_eig or neig >= len(X)
# Calculate the NLL with these eigenvalues and eigenvectors
w = np.maximum(w, 1e-10)
alpha = Q @ (np.diag(1.0 / w) @ (Q.T @ y))
nll = 0.5 * (
y.T @ alpha + np.sum(np.log(w)) + min(neig, len(X)) * np.log(2 * np.pi)
)
if not eval_gradient:
return nll.item()
KyinvaaT = invert(Ky)
KyinvaaT -= np.outer(alpha, alpha)
dnll = 0.5 * np.trace(KyinvaaT @ dKy) # Rasmussen&Williams p. 114, eq. 5.9
if fixed_sigma_n:
dnll = dnll[:-1]
hyp = hyp[:-1]
if log_scale_hyp:
# Chain rule
dnll *= hyp * np.log(10)
return nll.item(), dnll
[docs]def invert_cholesky(L):
r"""Inverts a positive-definite matrix based on a Cholesky decomposition.
This is used to invert the covariance matrix.
Parameters:
L (ndarray): Lower triangular matrix from a Cholesky decomposition.
Returns:
ndarray: Inverse of the matrix $L^T L$
"""
from scipy.linalg import solve_triangular
return solve_triangular(
L.T,
solve_triangular(L, np.eye(L.shape[0]), lower=True, check_finite=False),
lower=False,
check_finite=False,
)
[docs]def invert(K, neig=0, tol=1e-10, eps=1e-6, max_iter=1000):
"""Inverts a positive-definite matrix using either a Cholesky- or an Eigendecomposition.
The solution method depends on the rapidness of decay of the eigenvalues.
Parameters:
K (np.ndarray): Kernel matrix.
neig (int): Initial number of eigenvalues to calculate if the Cholesky decomposition is not successful.
This is doubled during every iteration.
tol (float): Convergence criterion for the eigenvalues.
eps (float): Small number to be added to diagonal of kernel matrix to ensure positive definiteness.
max_iter (int): Maximum number of iterations of the eigenvalue solver until convergence must be reached.
Returns:
np.ndarray: Inverse covariance matrix.
"""
from scipy.sparse.linalg import eigsh
try:
L = np.linalg.cholesky(K) # Cholesky decomposition of the covariance matrix
except np.linalg.LinAlgError:
L = np.linalg.cholesky(K + eps * np.eye(K.shape[0], K.shape[1]))
if neig <= 0 or neig > 0.05 * len(K): # First try with Cholesky decomposition
try:
return invert_cholesky(L)
except np.linalg.LinAlgError:
print("Warning! Fallback to eig solver!")
# Otherwise, calculate the first neig eigenvalues and eigenvectors
w, Q = eigsh(K, neig, tol=tol)
for iteration in range(max_iter): # Iterate until convergence or max_iter
if neig > 0.05 * len(K):
try:
return invert_cholesky(L)
except np.linalg.LinAlgError:
print("Warning! Fallback to eig solver!")
neig = 2 * neig # Calculate more eigenvalues
w, Q = eigsh(K, neig, tol=tol)
# Convergence criterion
if np.abs(w[0] - tol) <= tol or neig >= len(K):
break
if iteration == max_iter:
print("Tried maximum number of times.")
negative_eig = (-1e-2 < w) & (w < 0)
w[negative_eig] = 1e-2
K_inv = Q @ (np.diag(1 / w) @ Q.T)
return K_inv
[docs]def marginal_variance_BBQ(
Xtrain,
ytrain,
Xpred,
kernel,
hyperparameters,
hess_inv,
fixed_sigma_n=False,
alpha=None,
predictive_variance=0,
):
r"""Calculates the marginal variance to infer the next point in active learning.
The calculation follows Osborne (2012).
$\tilde{V}$ ... Marginal covariance matrix
$\hat{V}$ ... Predictive variance
$\frac{dm}{d\theta}$ ... Derivative of the predictive mean w.r.t. the hyperparameters
$H$ ... Hessian matrix
$$
\begin{equation}
\tilde{V} = \left( \frac{dm}{d\theta} \right) H^{-1} \left( \frac{dm}{d\theta} \right)^T
\end{equation}
$$
Parameters:
Xtrain (ndarray): Input training points.
ytrain (ndarray): Observed output daat.
Xpred (ndarray): Possible prediction input points.
kernel (function): Function to build the covariance matrix.
hyperparameters (dict): Dictionary of the hyperparameters.
hess_inv (ndarray): Inverse Hessian matrix.
fixed_sigma_n (bool): If the noise $\sigma_n$ was fixed during optimization. Then, the Hessian has to be
padded with zeros.
alpha (ndarray): If available, the result of $K_y^{-1} y$, else None.
predictive_variance (ndarray): Predictive variance only. This is added to the marginal variance.
Returns:
ndarray: Sum of the actual marginal variance and the predictive variance.
"""
# TODO: Add full derivatives as in Osborne (2021) and Garnett (2014)
ordered_hyperparameters = [
hyperparameters[key] for key in ("length_scale", "sigma_f", "sigma_n")
]
# If no Hessian is available, use only the predictive variance.
if hess_inv is None:
return predictive_variance.reshape(-1, 1)
if fixed_sigma_n:
padding = np.zeros((len(ordered_hyperparameters), 1))
hess_inv = np.hstack([np.vstack([hess_inv, padding[:-1].T]), padding])
# Kernels and their derivatives
Ky, dKy = kernel(Xtrain, Xtrain, *ordered_hyperparameters, eval_gradient=True)
Kstar, dKstar = kernel(
Xpred, Xtrain, *ordered_hyperparameters[:-1], eval_gradient=True
)
Kyinv = invert(Ky)
if alpha is None:
alpha = Kyinv @ ytrain
dalpha_dl = -Kyinv @ (dKy[..., 0] @ alpha)
dalpha_dsigma_f = -Kyinv @ (dKy[..., 1] @ alpha)
dalpha_dsigma_n = -Kyinv @ (
2 * hyperparameters["sigma_n"] * np.eye(Xtrain.shape[0]) @ alpha
)
dm = np.empty((Xpred.shape[0], len(ordered_hyperparameters), 1))
dm[:, 0, :] = dKstar[..., 0] @ alpha + Kstar @ dalpha_dl
dm[:, 1, :] = dKstar[..., 1] @ alpha + Kstar @ dalpha_dsigma_f
dm[:, 2, :] = Kstar @ dalpha_dsigma_n
dm = dm.squeeze()
marginal_variance = dm @ (hess_inv @ dm.T)
marginal_variance = np.diag(marginal_variance).reshape(-1, 1)
return marginal_variance + predictive_variance
[docs]def predict_f(hyp, x, y, xtest, kernel, return_full_cov=False, neig=0):
"""Predicts values given only a set of hyperparameters and a kernel.
This function is independent of the surrogates and used to quickly predict a function output with specific
hyperparameters.
The calculation follows Rasmussen&Williams, p. 16, eq. 23-24.
Parameters:
hyp (ndarray): Flat array of hyperparameters, sorted as [length_scale, sigma_f, sigma_n].
x (ndarray): Input training points.
y (ndarray): Observed output data.
xtest (ndarray): Prediction points.
kernel (function): Function to build the covariance matrix.
return_full_cov (bool): If True, returns the full covariance matrix, otherwise only its diagonal.
neig (int): Initial number of eigenvalues to be computed during the inversion of the covariance matrix.
Returns:
tuple: A tuple containing:
- fstar (ndarray): Posterior mean.
- vstar (ndarray): Posterior covariance matrix or its diagonal.
"""
if len(hyp) < 3:
hyp[2] = 0 # sigma_n
Ky = kernel(x, x, hyp[:-2], *hyp[-2:])
Kstar = kernel(x, xtest, hyp[:-2], hyp[-2])
Kstarstar = kernel(xtest, xtest, *hyp)
Kyinv = invert(Ky, neig, 1e-6 * hyp[-1])
fstar = Kstar.T @ (Kyinv @ y)
vstar = Kstarstar - Kstar.T @ (invert(Ky) @ Kstar)
vstar = np.maximum(vstar, 1e-10) # Assure a positive variance
if not return_full_cov:
vstar = np.diag(vstar)
return fstar, vstar