some documenting, and fiddling with the laplace approx

This commit is contained in:
James Hensman 2014-01-31 16:59:06 +00:00
parent 9f40ab0f83
commit 399adb1b00
3 changed files with 86 additions and 156 deletions

View file

@ -53,6 +53,6 @@ class ExactGaussianInference(object):
likelihood.update_gradients(np.diag(dL_dK)) likelihood.update_gradients(np.diag(dL_dK))
return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK} return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}

View file

@ -15,49 +15,32 @@ import scipy as sp
from likelihood import likelihood from likelihood import likelihood
from ..util.linalg import mdot, jitchol, pddet, dpotrs from ..util.linalg import mdot, jitchol, pddet, dpotrs
from functools import partial as partial_func from functools import partial as partial_func
from posterior import Posterior
import warnings import warnings
class LaplaceInference(object): class LaplaceInference(object):
"""Laplace approximation to a posterior""" """Laplace approximation to a posterior"""
def __init__(self, data, noise_model, extra_data=None): def __init__(self):
""" """
Laplace Approximation Laplace Approximation
Find the moments \hat{f} and the hessian at this point Find the moments \hat{f} and the hessian at this point
(using Newton-Raphson) of the unnormalised posterior (using Newton-Raphson) of the unnormalised posterior
Compute the GP variables (i.e. generate some Y^{squiggle} and
z^{squiggle} which makes a gaussian the same as the laplace
approximation to the posterior, but normalised
Arguments
---------
:param data: array of data the likelihood function is approximating
:type data: NxD
:param noise_model: likelihood function - subclass of noise_model
:type noise_model: noise_model
:param extra_data: additional data used by some likelihood functions,
""" """
self.data = data
self.noise_model = noise_model
self.extra_data = extra_data
#Inital values #Inital values
self.N, self.D = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi)) self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
self.restart()
likelihood.__init__(self)
def inference(self, kern, X, likelihood, Y, Y_metadata=None): def inference(self, kern, X, likelihood, Y, Y_metadata=None):
""" """
Returns a Posterior class containing essential quantities of the posterior Returns a Posterior class containing essential quantities of the posterior
""" """
self.N, self.D = self.data.shape
self.restart()
# Compute K # Compute K
self.K = kern.K(X) self.K = kern.K(X)
self.data = Y self.data = Y
@ -69,10 +52,11 @@ class LaplaceInference(object):
#Compute hessian and other variables at mode #Compute hessian and other variables at mode
self._compute_likelihood_variables() self._compute_likelihood_variables()
#Compute fake variables replicating laplace approximation to posterior likelihood.gradient = self.likelihood_gradients()
self._compute_GP_variables() dL_dK = self._Kgradients()
kern.update_gradients_full(dL_dK)
return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K) return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
def restart(self): def restart(self):
""" """
@ -88,37 +72,10 @@ class LaplaceInference(object):
self.old_Ki_f = None self.old_Ki_f = None
self.bad_fhat = False self.bad_fhat = False
def predictive_values(self,mu,var,full_cov,**noise_args):
if full_cov:
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
return self.noise_model.predictive_values(mu,var,**noise_args)
def log_predictive_density(self, y_test, mu_star, var_star):
"""
Calculation of the log predictive density
.. math:
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
:param y_test: test observations (y_{*})
:type y_test: (Nx1) array
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
:type mu_star: (Nx1) array
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
:type var_star: (Nx1) array
"""
return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
def _get_params(self):
return np.asarray(self.noise_model._get_params())
def _get_param_names(self):
return self.noise_model._get_param_names()
def _set_params(self, p):
return self.noise_model._set_params(p)
def _shared_gradients_components(self): def _shared_gradients_components(self):
"""
A helper function to compute some common quantities
"""
d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data) d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5? dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i) I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
@ -132,41 +89,30 @@ class LaplaceInference(object):
:rtype: Matrix (1 x num_kernel_params) :rtype: Matrix (1 x num_kernel_params)
""" """
dL_dfhat, I_KW_i = self._shared_gradients_components() dL_dfhat, I_KW_i = self._shared_gradients_components()
dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data) dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
#Explicit #Explicit
#expl_a = np.dot(self.Ki_f, self.Ki_f.T) expl_a = np.dot(self.Ki_f, self.Ki_f.T)
#expl_b = self.Wi_K_i expl_b = self.Wi_K_i
#expl = 0.5*expl_a - 0.5*expl_b expl = 0.5*expl_a - 0.5*expl_b
#dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_exp = dK_dthetaK(expl, X)
#Implicit #Implicit
impl = mdot(dlp, dL_dfhat, I_KW_i) impl = mdot(dlp, dL_dfhat, I_KW_i)
#No longer required as we are computing these in the gp already dL_dK = expl + impl
#otherwise we would take them away and add them back
#dL_dthetaK_imp = dK_dthetaK(impl, X)
#dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
#dL_dK = expl + impl
#No need to compute explicit as we are computing dZ_dK to account
#for the difference between the K gradients of a normal GP,
#and the K gradients including the implicit part
dL_dK = impl
return dL_dK return dL_dK
def _gradients(self, partial): def likelihood_gradients(self):
""" """
Gradients with respect to likelihood parameters (dL_dthetaL) Gradients with respect to likelihood parameters (dL_dthetaL)
:param partial: Not needed by this likelihood
:type partial: lambda function
:rtype: array of derivatives (1 x num_likelihood_params) :rtype: array of derivatives (1 x num_likelihood_params)
""" """
dL_dfhat, I_KW_i = self._shared_gradients_components() dL_dfhat, I_KW_i = self._shared_gradients_components()
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data) dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
#len(dlik_dthetaL)
num_params = len(self._get_param_names()) num_params = len(self._get_param_names())
# make space for one derivative for each likelihood parameter # make space for one derivative for each likelihood parameter
dL_dthetaL = np.zeros(num_params) dL_dthetaL = np.zeros(num_params)
@ -184,88 +130,9 @@ class LaplaceInference(object):
return dL_dthetaL return dL_dthetaL
def _compute_GP_variables(self):
"""
Generate data Y which would give the normal distribution identical
to the laplace approximation to the posterior, but normalised
GPy expects a likelihood to be gaussian, so need to caluclate
the data Y^{\tilde} that makes the posterior match that found
by a laplace approximation to a non-gaussian likelihood but with
a gaussian likelihood
Firstly,
The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
we wish to find the hessian \Sigma^{\tilde}
that has the same curvature but using our new simulated data Y^{\tilde}
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
Secondly,
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
So we can suck up any differences between that and our log marginal likelihood approximation
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
which we want to optimize instead, by equating them and rearranging, the difference is added onto
the log p(y) that GPy optimizes by default
Thirdly,
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
gp.py code
"""
Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat
self.Wi_K_i = self.W12BiW12
ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (+ lik
- 0.5*self.ln_B_det
+ 0.5*ln_det_Wi_K
- 0.5*self.f_Ki_f
+ 0.5*y_Wi_K_i_y
+ self.NORMAL_CONST
)
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = self.Sigma_tilde
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
#Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
self.dZ_dK = self._Kgradients()
#+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
def fit_full(self, K):
"""
The laplace approximation algorithm, find K and expand hessian
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:param K: Prior covariance matrix evaluated at locations X
:type K: NxN matrix
"""
self.K = K.copy()
#Find mode
self.f_hat = self.rasm_mode(self.K)
#Compute hessian and other variables at mode
self._compute_likelihood_variables()
#Compute fake variables replicating laplace approximation to posterior
self._compute_GP_variables()
def _compute_likelihood_variables(self): def _compute_likelihood_variables(self):
""" """
Compute the variables required to compute gaussian Y variables At the mode, compute the hessian and effective covaraince matrix.
""" """
#At this point get the hessian matrix (or vector as W is diagonal) #At this point get the hessian matrix (or vector as W is diagonal)
self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data) self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
@ -422,3 +289,65 @@ class LaplaceInference(object):
self.Ki_f = Ki_f self.Ki_f = Ki_f
return f return f
def _compute_GP_variables(self):
"""
Generate data Y which would give the normal distribution identical
to the laplace approximation to the posterior, but normalised
GPy expects a likelihood to be gaussian, so need to caluclate
the data Y^{\tilde} that makes the posterior match that found
by a laplace approximation to a non-gaussian likelihood but with
a gaussian likelihood
Firstly,
The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
we wish to find the hessian \Sigma^{\tilde}
that has the same curvature but using our new simulated data Y^{\tilde}
i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
Secondly,
GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
So we can suck up any differences between that and our log marginal likelihood approximation
p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
which we want to optimize instead, by equating them and rearranging, the difference is added onto
the log p(y) that GPy optimizes by default
Thirdly,
Since we have gradients that depend on how we move f^{\hat}, we have implicit components
aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
gp.py code
"""
Wi = 1.0/self.W
self.Sigma_tilde = np.diagflat(Wi)
Y_tilde = Wi*self.Ki_f + self.f_hat
self.Wi_K_i = self.W12BiW12
ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (+ lik
- 0.5*self.ln_B_det
+ 0.5*ln_det_Wi_K
- 0.5*self.f_Ki_f
+ 0.5*y_Wi_K_i_y
+ self.NORMAL_CONST
)
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = self.Sigma_tilde
self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
#Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
self.dZ_dK = self._Kgradients()
#+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again

View file

@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
class Posterior(object): class Posterior(object):
""" """
An object to represent a Gaussian posterior over latent function values. An object to represent a Gaussian posterior over latent function values, p(f|D).
This may be computed exactly for Gaussian likelihoods, or approximated for This may be computed exactly for Gaussian likelihoods, or approximated for
non-Gaussian likelihoods. non-Gaussian likelihoods.
The purpose of this class is to serve as an interface between the inference The purpose of this class is to serve as an interface between the inference
schemes and the model classes. schemes and the model classes. the model class can make predictions for
the function at any new point x_* by integrating over this posterior.
""" """
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None): def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):