From 399adb1b008180112fa97ee20877a109bfad9f3c Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 31 Jan 2014 16:59:06 +0000 Subject: [PATCH] some documenting, and fiddling with the laplace approx --- .../exact_gaussian_inference.py | 2 +- .../latent_function_inference/laplace.py | 235 ++++++------------ .../latent_function_inference/posterior.py | 5 +- 3 files changed, 86 insertions(+), 156 deletions(-) diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index 907e8485..2c3bfa6a 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -53,6 +53,6 @@ class ExactGaussianInference(object): likelihood.update_gradients(np.diag(dL_dK)) - return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK} + return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK} diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index fa5bb3b8..e5165da6 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -15,49 +15,32 @@ import scipy as sp from likelihood import likelihood from ..util.linalg import mdot, jitchol, pddet, dpotrs from functools import partial as partial_func +from posterior import Posterior import warnings class LaplaceInference(object): """Laplace approximation to a posterior""" - def __init__(self, data, noise_model, extra_data=None): + def __init__(self): """ Laplace Approximation Find the moments \hat{f} and the hessian at this point (using Newton-Raphson) of the unnormalised posterior - Compute the GP variables (i.e. generate some Y^{squiggle} and - z^{squiggle} which makes a gaussian the same as the laplace - approximation to the posterior, but normalised - - Arguments - --------- - - :param data: array of data the likelihood function is approximating - :type data: NxD - :param noise_model: likelihood function - subclass of noise_model - :type noise_model: noise_model - :param extra_data: additional data used by some likelihood functions, """ - self.data = data - self.noise_model = noise_model - self.extra_data = extra_data - #Inital values - self.N, self.D = self.data.shape - self.is_heteroscedastic = True - self.Nparams = 0 self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi)) - self.restart() - likelihood.__init__(self) def inference(self, kern, X, likelihood, Y, Y_metadata=None): """ Returns a Posterior class containing essential quantities of the posterior """ + self.N, self.D = self.data.shape + self.restart() + # Compute K self.K = kern.K(X) self.data = Y @@ -69,10 +52,11 @@ class LaplaceInference(object): #Compute hessian and other variables at mode self._compute_likelihood_variables() - #Compute fake variables replicating laplace approximation to posterior - self._compute_GP_variables() + likelihood.gradient = self.likelihood_gradients() + dL_dK = self._Kgradients() + kern.update_gradients_full(dL_dK) - return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K) + return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK} def restart(self): """ @@ -88,37 +72,10 @@ class LaplaceInference(object): self.old_Ki_f = None self.bad_fhat = False - def predictive_values(self,mu,var,full_cov,**noise_args): - if full_cov: - raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" - return self.noise_model.predictive_values(mu,var,**noise_args) - - def log_predictive_density(self, y_test, mu_star, var_star): - """ - Calculation of the log predictive density - - .. math: - p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*}) - - :param y_test: test observations (y_{*}) - :type y_test: (Nx1) array - :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*}) - :type mu_star: (Nx1) array - :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*}) - :type var_star: (Nx1) array - """ - return self.noise_model.log_predictive_density(y_test, mu_star, var_star) - - def _get_params(self): - return np.asarray(self.noise_model._get_params()) - - def _get_param_names(self): - return self.noise_model._get_param_names() - - def _set_params(self, p): - return self.noise_model._set_params(p) - def _shared_gradients_components(self): + """ + A helper function to compute some common quantities + """ d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data) dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5? I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i) @@ -132,41 +89,30 @@ class LaplaceInference(object): :rtype: Matrix (1 x num_kernel_params) """ dL_dfhat, I_KW_i = self._shared_gradients_components() - dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data) + dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work? #Explicit - #expl_a = np.dot(self.Ki_f, self.Ki_f.T) - #expl_b = self.Wi_K_i - #expl = 0.5*expl_a - 0.5*expl_b - #dL_dthetaK_exp = dK_dthetaK(expl, X) + expl_a = np.dot(self.Ki_f, self.Ki_f.T) + expl_b = self.Wi_K_i + expl = 0.5*expl_a - 0.5*expl_b + dL_dthetaK_exp = dK_dthetaK(expl, X) #Implicit impl = mdot(dlp, dL_dfhat, I_KW_i) - #No longer required as we are computing these in the gp already - #otherwise we would take them away and add them back - #dL_dthetaK_imp = dK_dthetaK(impl, X) - #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp - #dL_dK = expl + impl + dL_dK = expl + impl - #No need to compute explicit as we are computing dZ_dK to account - #for the difference between the K gradients of a normal GP, - #and the K gradients including the implicit part - dL_dK = impl return dL_dK - def _gradients(self, partial): + def likelihood_gradients(self): """ Gradients with respect to likelihood parameters (dL_dthetaL) - :param partial: Not needed by this likelihood - :type partial: lambda function :rtype: array of derivatives (1 x num_likelihood_params) """ dL_dfhat, I_KW_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data) - #len(dlik_dthetaL) num_params = len(self._get_param_names()) # make space for one derivative for each likelihood parameter dL_dthetaL = np.zeros(num_params) @@ -184,88 +130,9 @@ class LaplaceInference(object): return dL_dthetaL - def _compute_GP_variables(self): - """ - Generate data Y which would give the normal distribution identical - to the laplace approximation to the posterior, but normalised - - GPy expects a likelihood to be gaussian, so need to caluclate - the data Y^{\tilde} that makes the posterior match that found - by a laplace approximation to a non-gaussian likelihood but with - a gaussian likelihood - - Firstly, - The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1}, - i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood, - we wish to find the hessian \Sigma^{\tilde} - that has the same curvature but using our new simulated data Y^{\tilde} - i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) - and we wish to find what Y^{\tilde} and \Sigma^{\tilde} - We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1} - - Secondly, - GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z - So we can suck up any differences between that and our log marginal likelihood approximation - p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W| - which we want to optimize instead, by equating them and rearranging, the difference is added onto - the log p(y) that GPy optimizes by default - - Thirdly, - Since we have gradients that depend on how we move f^{\hat}, we have implicit components - aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the - gp.py code - """ - Wi = 1.0/self.W - self.Sigma_tilde = np.diagflat(Wi) - - Y_tilde = Wi*self.Ki_f + self.f_hat - - self.Wi_K_i = self.W12BiW12 - ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) - lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data) - y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - - Z_tilde = (+ lik - - 0.5*self.ln_B_det - + 0.5*ln_det_Wi_K - - 0.5*self.f_Ki_f - + 0.5*y_Wi_K_i_y - + self.NORMAL_CONST - ) - - #Convert to float as its (1, 1) and Z must be a scalar - self.Z = np.float64(Z_tilde) - self.Y = Y_tilde - self.YYT = np.dot(self.Y, self.Y.T) - self.covariance_matrix = self.Sigma_tilde - self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None] - - #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods - self.dZ_dK = self._Kgradients() - #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again - - def fit_full(self, K): - """ - The laplace approximation algorithm, find K and expand hessian - For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability - - :param K: Prior covariance matrix evaluated at locations X - :type K: NxN matrix - """ - self.K = K.copy() - - #Find mode - self.f_hat = self.rasm_mode(self.K) - - #Compute hessian and other variables at mode - self._compute_likelihood_variables() - - #Compute fake variables replicating laplace approximation to posterior - self._compute_GP_variables() - def _compute_likelihood_variables(self): """ - Compute the variables required to compute gaussian Y variables + At the mode, compute the hessian and effective covaraince matrix. """ #At this point get the hessian matrix (or vector as W is diagonal) self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data) @@ -422,3 +289,65 @@ class LaplaceInference(object): self.Ki_f = Ki_f return f + + def _compute_GP_variables(self): + """ + Generate data Y which would give the normal distribution identical + to the laplace approximation to the posterior, but normalised + + GPy expects a likelihood to be gaussian, so need to caluclate + the data Y^{\tilde} that makes the posterior match that found + by a laplace approximation to a non-gaussian likelihood but with + a gaussian likelihood + + Firstly, + The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1}, + i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood, + we wish to find the hessian \Sigma^{\tilde} + that has the same curvature but using our new simulated data Y^{\tilde} + i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) + and we wish to find what Y^{\tilde} and \Sigma^{\tilde} + We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1} + + Secondly, + GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z + So we can suck up any differences between that and our log marginal likelihood approximation + p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W| + which we want to optimize instead, by equating them and rearranging, the difference is added onto + the log p(y) that GPy optimizes by default + + Thirdly, + Since we have gradients that depend on how we move f^{\hat}, we have implicit components + aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the + gp.py code + """ + Wi = 1.0/self.W + self.Sigma_tilde = np.diagflat(Wi) + + Y_tilde = Wi*self.Ki_f + self.f_hat + + self.Wi_K_i = self.W12BiW12 + ln_det_Wi_K = pddet(self.Sigma_tilde + self.K) + lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data) + y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) + + Z_tilde = (+ lik + - 0.5*self.ln_B_det + + 0.5*ln_det_Wi_K + - 0.5*self.f_Ki_f + + 0.5*y_Wi_K_i_y + + self.NORMAL_CONST + ) + + #Convert to float as its (1, 1) and Z must be a scalar + self.Z = np.float64(Z_tilde) + self.Y = Y_tilde + self.YYT = np.dot(self.Y, self.Y.T) + self.covariance_matrix = self.Sigma_tilde + self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None] + + #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods + self.dZ_dK = self._Kgradients() + #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again + + diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index b3a352e5..c0974dc5 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify class Posterior(object): """ - An object to represent a Gaussian posterior over latent function values. + An object to represent a Gaussian posterior over latent function values, p(f|D). This may be computed exactly for Gaussian likelihoods, or approximated for non-Gaussian likelihoods. The purpose of this class is to serve as an interface between the inference - schemes and the model classes. + schemes and the model classes. the model class can make predictions for + the function at any new point x_* by integrating over this posterior. """ def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):