mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-30 14:35:15 +02:00
some documenting, and fiddling with the laplace approx
This commit is contained in:
parent
9f40ab0f83
commit
399adb1b00
3 changed files with 86 additions and 156 deletions
|
|
@ -53,6 +53,6 @@ class ExactGaussianInference(object):
|
||||||
|
|
||||||
likelihood.update_gradients(np.diag(dL_dK))
|
likelihood.update_gradients(np.diag(dL_dK))
|
||||||
|
|
||||||
return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK}
|
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,49 +15,32 @@ import scipy as sp
|
||||||
from likelihood import likelihood
|
from likelihood import likelihood
|
||||||
from ..util.linalg import mdot, jitchol, pddet, dpotrs
|
from ..util.linalg import mdot, jitchol, pddet, dpotrs
|
||||||
from functools import partial as partial_func
|
from functools import partial as partial_func
|
||||||
|
from posterior import Posterior
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
class LaplaceInference(object):
|
class LaplaceInference(object):
|
||||||
"""Laplace approximation to a posterior"""
|
"""Laplace approximation to a posterior"""
|
||||||
|
|
||||||
def __init__(self, data, noise_model, extra_data=None):
|
def __init__(self):
|
||||||
"""
|
"""
|
||||||
Laplace Approximation
|
Laplace Approximation
|
||||||
|
|
||||||
Find the moments \hat{f} and the hessian at this point
|
Find the moments \hat{f} and the hessian at this point
|
||||||
(using Newton-Raphson) of the unnormalised posterior
|
(using Newton-Raphson) of the unnormalised posterior
|
||||||
|
|
||||||
Compute the GP variables (i.e. generate some Y^{squiggle} and
|
|
||||||
z^{squiggle} which makes a gaussian the same as the laplace
|
|
||||||
approximation to the posterior, but normalised
|
|
||||||
|
|
||||||
Arguments
|
|
||||||
---------
|
|
||||||
|
|
||||||
:param data: array of data the likelihood function is approximating
|
|
||||||
:type data: NxD
|
|
||||||
:param noise_model: likelihood function - subclass of noise_model
|
|
||||||
:type noise_model: noise_model
|
|
||||||
:param extra_data: additional data used by some likelihood functions,
|
|
||||||
"""
|
"""
|
||||||
self.data = data
|
|
||||||
self.noise_model = noise_model
|
|
||||||
self.extra_data = extra_data
|
|
||||||
|
|
||||||
#Inital values
|
#Inital values
|
||||||
self.N, self.D = self.data.shape
|
|
||||||
self.is_heteroscedastic = True
|
|
||||||
self.Nparams = 0
|
|
||||||
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
|
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
|
||||||
|
|
||||||
self.restart()
|
|
||||||
likelihood.__init__(self)
|
|
||||||
|
|
||||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
||||||
"""
|
"""
|
||||||
Returns a Posterior class containing essential quantities of the posterior
|
Returns a Posterior class containing essential quantities of the posterior
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
self.N, self.D = self.data.shape
|
||||||
|
self.restart()
|
||||||
|
|
||||||
# Compute K
|
# Compute K
|
||||||
self.K = kern.K(X)
|
self.K = kern.K(X)
|
||||||
self.data = Y
|
self.data = Y
|
||||||
|
|
@ -69,10 +52,11 @@ class LaplaceInference(object):
|
||||||
#Compute hessian and other variables at mode
|
#Compute hessian and other variables at mode
|
||||||
self._compute_likelihood_variables()
|
self._compute_likelihood_variables()
|
||||||
|
|
||||||
#Compute fake variables replicating laplace approximation to posterior
|
likelihood.gradient = self.likelihood_gradients()
|
||||||
self._compute_GP_variables()
|
dL_dK = self._Kgradients()
|
||||||
|
kern.update_gradients_full(dL_dK)
|
||||||
|
|
||||||
return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K)
|
return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
|
||||||
|
|
||||||
def restart(self):
|
def restart(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -88,37 +72,10 @@ class LaplaceInference(object):
|
||||||
self.old_Ki_f = None
|
self.old_Ki_f = None
|
||||||
self.bad_fhat = False
|
self.bad_fhat = False
|
||||||
|
|
||||||
def predictive_values(self,mu,var,full_cov,**noise_args):
|
|
||||||
if full_cov:
|
|
||||||
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
|
|
||||||
return self.noise_model.predictive_values(mu,var,**noise_args)
|
|
||||||
|
|
||||||
def log_predictive_density(self, y_test, mu_star, var_star):
|
|
||||||
"""
|
|
||||||
Calculation of the log predictive density
|
|
||||||
|
|
||||||
.. math:
|
|
||||||
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
|
|
||||||
|
|
||||||
:param y_test: test observations (y_{*})
|
|
||||||
:type y_test: (Nx1) array
|
|
||||||
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
|
|
||||||
:type mu_star: (Nx1) array
|
|
||||||
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
|
|
||||||
:type var_star: (Nx1) array
|
|
||||||
"""
|
|
||||||
return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
|
|
||||||
|
|
||||||
def _get_params(self):
|
|
||||||
return np.asarray(self.noise_model._get_params())
|
|
||||||
|
|
||||||
def _get_param_names(self):
|
|
||||||
return self.noise_model._get_param_names()
|
|
||||||
|
|
||||||
def _set_params(self, p):
|
|
||||||
return self.noise_model._set_params(p)
|
|
||||||
|
|
||||||
def _shared_gradients_components(self):
|
def _shared_gradients_components(self):
|
||||||
|
"""
|
||||||
|
A helper function to compute some common quantities
|
||||||
|
"""
|
||||||
d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
|
d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
|
||||||
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
|
||||||
|
|
@ -132,41 +89,30 @@ class LaplaceInference(object):
|
||||||
:rtype: Matrix (1 x num_kernel_params)
|
:rtype: Matrix (1 x num_kernel_params)
|
||||||
"""
|
"""
|
||||||
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
||||||
dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
|
dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
|
||||||
|
|
||||||
#Explicit
|
#Explicit
|
||||||
#expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
expl_a = np.dot(self.Ki_f, self.Ki_f.T)
|
||||||
#expl_b = self.Wi_K_i
|
expl_b = self.Wi_K_i
|
||||||
#expl = 0.5*expl_a - 0.5*expl_b
|
expl = 0.5*expl_a - 0.5*expl_b
|
||||||
#dL_dthetaK_exp = dK_dthetaK(expl, X)
|
dL_dthetaK_exp = dK_dthetaK(expl, X)
|
||||||
|
|
||||||
#Implicit
|
#Implicit
|
||||||
impl = mdot(dlp, dL_dfhat, I_KW_i)
|
impl = mdot(dlp, dL_dfhat, I_KW_i)
|
||||||
|
|
||||||
#No longer required as we are computing these in the gp already
|
dL_dK = expl + impl
|
||||||
#otherwise we would take them away and add them back
|
|
||||||
#dL_dthetaK_imp = dK_dthetaK(impl, X)
|
|
||||||
#dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
|
|
||||||
#dL_dK = expl + impl
|
|
||||||
|
|
||||||
#No need to compute explicit as we are computing dZ_dK to account
|
|
||||||
#for the difference between the K gradients of a normal GP,
|
|
||||||
#and the K gradients including the implicit part
|
|
||||||
dL_dK = impl
|
|
||||||
return dL_dK
|
return dL_dK
|
||||||
|
|
||||||
def _gradients(self, partial):
|
def likelihood_gradients(self):
|
||||||
"""
|
"""
|
||||||
Gradients with respect to likelihood parameters (dL_dthetaL)
|
Gradients with respect to likelihood parameters (dL_dthetaL)
|
||||||
|
|
||||||
:param partial: Not needed by this likelihood
|
|
||||||
:type partial: lambda function
|
|
||||||
:rtype: array of derivatives (1 x num_likelihood_params)
|
:rtype: array of derivatives (1 x num_likelihood_params)
|
||||||
"""
|
"""
|
||||||
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
dL_dfhat, I_KW_i = self._shared_gradients_components()
|
||||||
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
|
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
|
|
||||||
#len(dlik_dthetaL)
|
|
||||||
num_params = len(self._get_param_names())
|
num_params = len(self._get_param_names())
|
||||||
# make space for one derivative for each likelihood parameter
|
# make space for one derivative for each likelihood parameter
|
||||||
dL_dthetaL = np.zeros(num_params)
|
dL_dthetaL = np.zeros(num_params)
|
||||||
|
|
@ -184,88 +130,9 @@ class LaplaceInference(object):
|
||||||
|
|
||||||
return dL_dthetaL
|
return dL_dthetaL
|
||||||
|
|
||||||
def _compute_GP_variables(self):
|
|
||||||
"""
|
|
||||||
Generate data Y which would give the normal distribution identical
|
|
||||||
to the laplace approximation to the posterior, but normalised
|
|
||||||
|
|
||||||
GPy expects a likelihood to be gaussian, so need to caluclate
|
|
||||||
the data Y^{\tilde} that makes the posterior match that found
|
|
||||||
by a laplace approximation to a non-gaussian likelihood but with
|
|
||||||
a gaussian likelihood
|
|
||||||
|
|
||||||
Firstly,
|
|
||||||
The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
|
|
||||||
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
|
|
||||||
we wish to find the hessian \Sigma^{\tilde}
|
|
||||||
that has the same curvature but using our new simulated data Y^{\tilde}
|
|
||||||
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
|
|
||||||
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
|
|
||||||
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
|
|
||||||
|
|
||||||
Secondly,
|
|
||||||
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
|
|
||||||
So we can suck up any differences between that and our log marginal likelihood approximation
|
|
||||||
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
|
|
||||||
which we want to optimize instead, by equating them and rearranging, the difference is added onto
|
|
||||||
the log p(y) that GPy optimizes by default
|
|
||||||
|
|
||||||
Thirdly,
|
|
||||||
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
|
|
||||||
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
|
|
||||||
gp.py code
|
|
||||||
"""
|
|
||||||
Wi = 1.0/self.W
|
|
||||||
self.Sigma_tilde = np.diagflat(Wi)
|
|
||||||
|
|
||||||
Y_tilde = Wi*self.Ki_f + self.f_hat
|
|
||||||
|
|
||||||
self.Wi_K_i = self.W12BiW12
|
|
||||||
ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
|
||||||
lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
|
|
||||||
y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
|
||||||
|
|
||||||
Z_tilde = (+ lik
|
|
||||||
- 0.5*self.ln_B_det
|
|
||||||
+ 0.5*ln_det_Wi_K
|
|
||||||
- 0.5*self.f_Ki_f
|
|
||||||
+ 0.5*y_Wi_K_i_y
|
|
||||||
+ self.NORMAL_CONST
|
|
||||||
)
|
|
||||||
|
|
||||||
#Convert to float as its (1, 1) and Z must be a scalar
|
|
||||||
self.Z = np.float64(Z_tilde)
|
|
||||||
self.Y = Y_tilde
|
|
||||||
self.YYT = np.dot(self.Y, self.Y.T)
|
|
||||||
self.covariance_matrix = self.Sigma_tilde
|
|
||||||
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
|
||||||
|
|
||||||
#Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
|
|
||||||
self.dZ_dK = self._Kgradients()
|
|
||||||
#+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
|
|
||||||
|
|
||||||
def fit_full(self, K):
|
|
||||||
"""
|
|
||||||
The laplace approximation algorithm, find K and expand hessian
|
|
||||||
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
|
|
||||||
|
|
||||||
:param K: Prior covariance matrix evaluated at locations X
|
|
||||||
:type K: NxN matrix
|
|
||||||
"""
|
|
||||||
self.K = K.copy()
|
|
||||||
|
|
||||||
#Find mode
|
|
||||||
self.f_hat = self.rasm_mode(self.K)
|
|
||||||
|
|
||||||
#Compute hessian and other variables at mode
|
|
||||||
self._compute_likelihood_variables()
|
|
||||||
|
|
||||||
#Compute fake variables replicating laplace approximation to posterior
|
|
||||||
self._compute_GP_variables()
|
|
||||||
|
|
||||||
def _compute_likelihood_variables(self):
|
def _compute_likelihood_variables(self):
|
||||||
"""
|
"""
|
||||||
Compute the variables required to compute gaussian Y variables
|
At the mode, compute the Hessian and effective covariance matrix.
|
||||||
"""
|
"""
|
||||||
#At this point get the hessian matrix (or vector as W is diagonal)
|
#At this point get the hessian matrix (or vector as W is diagonal)
|
||||||
self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
|
self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
|
|
@ -422,3 +289,65 @@ class LaplaceInference(object):
|
||||||
|
|
||||||
self.Ki_f = Ki_f
|
self.Ki_f = Ki_f
|
||||||
return f
|
return f
|
||||||
|
|
||||||
|
def _compute_GP_variables(self):
|
||||||
|
"""
|
||||||
|
Generate data Y which would give the normal distribution identical
|
||||||
|
to the laplace approximation to the posterior, but normalised
|
||||||
|
|
||||||
|
GPy expects a likelihood to be gaussian, so need to calculate
|
||||||
|
the data Y^{\tilde} that makes the posterior match that found
|
||||||
|
by a laplace approximation to a non-gaussian likelihood but with
|
||||||
|
a gaussian likelihood
|
||||||
|
|
||||||
|
Firstly,
|
||||||
|
The hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
|
||||||
|
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
|
||||||
|
we wish to find the hessian \Sigma^{\tilde}
|
||||||
|
that has the same curvature but using our new simulated data Y^{\tilde}
|
||||||
|
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
|
||||||
|
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
|
||||||
|
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\tilde} = W^{-1}
|
||||||
|
|
||||||
|
Secondly,
|
||||||
|
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K + \Sigma^{\tilde})^{-1}Y^{\tilde} + lik.Z
|
||||||
|
So we can suck up any differences between that and our log marginal likelihood approximation
|
||||||
|
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
|
||||||
|
which we want to optimize instead, by equating them and rearranging, the difference is added onto
|
||||||
|
the log p(y) that GPy optimizes by default
|
||||||
|
|
||||||
|
Thirdly,
|
||||||
|
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
|
||||||
|
as well as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
|
||||||
|
gp.py code
|
||||||
|
"""
|
||||||
|
Wi = 1.0/self.W
|
||||||
|
self.Sigma_tilde = np.diagflat(Wi)
|
||||||
|
|
||||||
|
Y_tilde = Wi*self.Ki_f + self.f_hat
|
||||||
|
|
||||||
|
self.Wi_K_i = self.W12BiW12
|
||||||
|
ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
|
||||||
|
lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
|
||||||
|
y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
||||||
|
|
||||||
|
Z_tilde = (+ lik
|
||||||
|
- 0.5*self.ln_B_det
|
||||||
|
+ 0.5*ln_det_Wi_K
|
||||||
|
- 0.5*self.f_Ki_f
|
||||||
|
+ 0.5*y_Wi_K_i_y
|
||||||
|
+ self.NORMAL_CONST
|
||||||
|
)
|
||||||
|
|
||||||
|
#Convert to float as its (1, 1) and Z must be a scalar
|
||||||
|
self.Z = np.float64(Z_tilde)
|
||||||
|
self.Y = Y_tilde
|
||||||
|
self.YYT = np.dot(self.Y, self.Y.T)
|
||||||
|
self.covariance_matrix = self.Sigma_tilde
|
||||||
|
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
|
||||||
|
|
||||||
|
#Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
|
||||||
|
self.dZ_dK = self._Kgradients()
|
||||||
|
#+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
|
||||||
|
|
||||||
class Posterior(object):
|
class Posterior(object):
|
||||||
"""
|
"""
|
||||||
An object to represent a Gaussian posterior over latent function values.
|
An object to represent a Gaussian posterior over latent function values, p(f|D).
|
||||||
This may be computed exactly for Gaussian likelihoods, or approximated for
|
This may be computed exactly for Gaussian likelihoods, or approximated for
|
||||||
non-Gaussian likelihoods.
|
non-Gaussian likelihoods.
|
||||||
|
|
||||||
The purpose of this class is to serve as an interface between the inference
|
The purpose of this class is to serve as an interface between the inference
|
||||||
schemes and the model classes.
|
schemes and the model classes. The model class can make predictions for
|
||||||
|
the function at any new point x_* by integrating over this posterior.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
|
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue