some documenting, and fiddling with the laplace approx

2026-05-30 14:35:15 +02:00 · 2014-01-31 16:59:06 +00:00 · 2014-01-31 16:59:06 +00:00 · 399adb1b00
commit 399adb1b00
parent 9f40ab0f83
3 changed files with 86 additions and 156 deletions
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@ -53,6 +53,6 @@ class ExactGaussianInference(object):
        likelihood.update_gradients(np.diag(dL_dK))
-        return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@ -15,49 +15,32 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
 from posterior import Posterior
 import warnings
 class LaplaceInference(object):
    """Laplace approximation to a posterior"""
-    def __init__(self, data, noise_model, extra_data=None):
+    def __init__(self):
        """
        Laplace Approximation
        Find the moments \hat{f} and the hessian at this point
        (using Newton-Raphson) of the unnormalised posterior
        Compute the GP variables (i.e. generate some Y^{squiggle} and
        z^{squiggle} which makes a gaussian the same as the laplace
        approximation to the posterior, but normalised
        Arguments
        ---------
        :param data: array of data the likelihood function is approximating
        :type data: NxD
        :param noise_model: likelihood function - subclass of noise_model
        :type noise_model: noise_model
        :param extra_data: additional data used by some likelihood functions,
        """
        self.data = data
        self.noise_model = noise_model
        self.extra_data = extra_data
        #Initial values
        self.N, self.D = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0
        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
        self.restart()
        likelihood.__init__(self)
    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """
        self.N, self.D = self.data.shape
        self.restart()
        # Compute K
        self.K = kern.K(X)
        self.data = Y
@ -69,10 +52,11 @@ class LaplaceInference(object):
        #Compute hessian and other variables at mode
        self._compute_likelihood_variables()
-        #Compute fake variables replicating laplace approximation to posterior
+        likelihood.gradient = self.likelihood_gradients()
-        self._compute_GP_variables()
+        dL_dK = self._Kgradients()
        kern.update_gradients_full(dL_dK)
-        return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K)
+        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
    def restart(self):
        """
@ -88,37 +72,10 @@ class LaplaceInference(object):
        self.old_Ki_f = None
        self.bad_fhat = False
    def predictive_values(self, mu, var, full_cov, **noise_args):
        """
        Predictive values for the observations, delegated to the noise model.

        :param mu: passed through as the first argument of noise_model.predictive_values
        :param var: passed through as the second argument of noise_model.predictive_values
        :param full_cov: must be falsy; correlated (full covariance) predictions are not supported
        :param noise_args: extra keyword arguments forwarded unchanged to the noise model
        :returns: whatever self.noise_model.predictive_values returns
        :raises NotImplementedError: if full_cov is truthy
        """
        if full_cov:
            # Call-style raise: the statement form `raise Exc, "msg"` is a
            # SyntaxError on Python 3, whereas this form is valid on 2 and 3.
            # NOTE(review): the message says "EP" although this class is the
            # Laplace approximation - confirm the intended wording.
            raise NotImplementedError("Cannot make correlated predictions with an EP likelihood")
        return self.noise_model.predictive_values(mu, var, **noise_args)
    def log_predictive_density(self, y_test, mu_star, var_star):
        r"""
        Log predictive density of test observations, as evaluated by the noise model.

        .. math::
            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}, \sigma^{2}_{*})

        NOTE(review): the expression above presumably stands for an integral
        over f_{*}; confirm against the noise model's implementation.

        :param y_test: test observations (y_{*})
        :type y_test: (Nx1) array
        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
        :type mu_star: (Nx1) array
        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
        :type var_star: (Nx1) array
        :returns: the value returned by noise_model.log_predictive_density
        """
        # Pure delegation: this class adds nothing to the noise model's answer.
        density = self.noise_model.log_predictive_density(y_test, mu_star, var_star)
        return density
    def _get_params(self):
        """
        Return the noise model's parameters converted with np.asarray.
        """
        raw_params = self.noise_model._get_params()
        return np.asarray(raw_params)
    def _get_param_names(self):
        """
        Return the parameter names exactly as reported by the noise model.
        """
        names = self.noise_model._get_param_names()
        return names
    def _set_params(self, p):
        """
        Forward p to the noise model's _set_params and hand back its result.

        :param p: parameter values, passed through untouched
        """
        outcome = self.noise_model._set_params(p)
        return outcome
    def _shared_gradients_components(self):
        """
        A helper function to compute some common quantities
        """
        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
@ -132,41 +89,30 @@ class LaplaceInference(object):
        :rtype: Matrix (1 x num_kernel_params)
        """
        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
        #Explicit
-        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        #expl_b = self.Wi_K_i
+        expl_b = self.Wi_K_i
-        #expl = 0.5*expl_a - 0.5*expl_b
+        expl = 0.5*expl_a - 0.5*expl_b
-        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
        #Implicit
        impl = mdot(dlp, dL_dfhat, I_KW_i)
-        #No longer required as we are computing these in the gp already
+        dL_dK = expl + impl
        #otherwise we would take them away and add them back
        #dL_dthetaK_imp = dK_dthetaK(impl, X)
        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
        #dL_dK = expl + impl
        #No need to compute explicit as we are computing dZ_dK to account
        #for the difference between the K gradients of a normal GP,
        #and the K gradients including the implicit part
        dL_dK = impl
        return dL_dK
-    def _gradients(self, partial):
+    def likelihood_gradients(self):
        """
        Gradients with respect to likelihood parameters (dL_dthetaL)
        :param partial: Not needed by this likelihood
        :type partial: lambda function
        :rtype: array of derivatives (1 x num_likelihood_params)
        """
        dL_dfhat, I_KW_i = self._shared_gradients_components()
        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
        #len(dlik_dthetaL)
        num_params = len(self._get_param_names())
        # make space for one derivative for each likelihood parameter
        dL_dthetaL = np.zeros(num_params)
@ -184,88 +130,9 @@ class LaplaceInference(object):
        return dL_dthetaL
    def _compute_GP_variables(self):
        r"""
        Generate data Y which would give the normal distribution identical
        to the laplace approximation to the posterior, but normalised

        GPy expects a likelihood to be gaussian, so we need to calculate
        the data Y^{\tilde} that makes the posterior match that found
        by a laplace approximation to a non-gaussian likelihood but with
        a gaussian likelihood

        Firstly,
        The hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
        we wish to find the hessian \Sigma^{\tilde}
        that has the same curvature but using our new simulated data Y^{\tilde}
        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
        and we wish to find what Y^{\tilde} and \Sigma^{\tilde} are.
        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\tilde} = W^{-1}

        Secondly,
        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{\tilde})^{-1}Y + lik.Z
        So we can suck up any differences between that and our log marginal likelihood approximation
        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
        which we want to optimize instead, by equating them and rearranging, the difference is added onto
        the log p(y) that GPy optimizes by default
        NOTE(review): the code's quadratic term uses self.Wi_K_i, whose name suggests
        (W^{-1} + K)^{-1} rather than the (K^{-1} + \Sigma^{\tilde})^{-1} written above - confirm.

        Thirdly,
        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
        as well as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
        gp.py code

        Reads self.W, self.Ki_f, self.f_hat, self.W12BiW12, self.K, self.ln_B_det,
        self.f_Ki_f, self.NORMAL_CONST, self.data and self.extra_data; writes
        self.Sigma_tilde, self.Wi_K_i, self.Z, self.Y, self.YYT,
        self.covariance_matrix, self.precision and self.dZ_dK.
        """
        # Element-wise reciprocal of W, laid out on the diagonal of Sigma_tilde
        Wi = 1.0/self.W
        self.Sigma_tilde = np.diagflat(Wi)

        # Simulated observations: W^{-1} * Ki_f + f_hat
        # (presumably Ki_f holds K^{-1} f_hat, matching the docstring formula - TODO confirm)
        Y_tilde = Wi*self.Ki_f + self.f_hat

        # Wi_K_i is simply an alias of the already-computed W12BiW12
        self.Wi_K_i = self.W12BiW12
        # presumably the log-determinant of (Sigma_tilde + K), judging by the name - verify pddet
        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
        # Noise model log-density evaluated at f_hat for the stored data
        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)

        # Correction term stored as self.Z (see "Secondly" in the docstring)
        Z_tilde = (+ lik
                   - 0.5*self.ln_B_det
                   + 0.5*ln_det_Wi_K
                   - 0.5*self.f_Ki_f
                   + 0.5*y_Wi_K_i_y
                   + self.NORMAL_CONST
                  )

        #Convert to float as its (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = self.Sigma_tilde
        # Reciprocal of the covariance diagonal, reshaped to a column
        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]

        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
        self.dZ_dK = self._Kgradients()
        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
    def fit_full(self, K):
        """
        Run the laplace approximation for a given prior covariance.

        Stores a copy of K, finds the mode with rasm_mode, then computes the
        likelihood variables followed by the GP variables.
        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability

        :param K: Prior covariance matrix evaluated at locations X
        :type K: NxN matrix
        """
        # Keep a private copy so self.K is a distinct object from the caller's matrix
        prior_cov = K.copy()
        self.K = prior_cov

        # Locate the mode using the stored copy
        self.f_hat = self.rasm_mode(prior_cov)

        # Hessian and related quantities at the mode, then the variables that
        # replicate the laplace approximation to the posterior
        self._compute_likelihood_variables()
        self._compute_GP_variables()
    def _compute_likelihood_variables(self):
        """
-        Compute the variables required to compute gaussian Y variables
+        At the mode, compute the hessian and effective covariance matrix.
        """
        #At this point get the hessian matrix (or vector as W is diagonal)
        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
@ -422,3 +289,65 @@ class LaplaceInference(object):
        self.Ki_f = Ki_f
        return f
    def _compute_GP_variables(self):
        r"""
        Generate data Y which would give the normal distribution identical
        to the laplace approximation to the posterior, but normalised

        GPy expects a likelihood to be gaussian, so we need to calculate
        the data Y^{\tilde} that makes the posterior match that found
        by a laplace approximation to a non-gaussian likelihood but with
        a gaussian likelihood

        Firstly,
        The hessian of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
        we wish to find the hessian \Sigma^{\tilde}
        that has the same curvature but using our new simulated data Y^{\tilde}
        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
        and we wish to find what Y^{\tilde} and \Sigma^{\tilde} are.
        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\tilde} = W^{-1}

        Secondly,
        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{\tilde})^{-1}Y + lik.Z
        So we can suck up any differences between that and our log marginal likelihood approximation
        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
        which we want to optimize instead, by equating them and rearranging, the difference is added onto
        the log p(y) that GPy optimizes by default
        NOTE(review): the code's quadratic term uses self.Wi_K_i, whose name suggests
        (W^{-1} + K)^{-1} rather than the (K^{-1} + \Sigma^{\tilde})^{-1} written above - confirm.

        Thirdly,
        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
        as well as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
        gp.py code

        Reads self.W, self.Ki_f, self.f_hat, self.W12BiW12, self.K, self.ln_B_det,
        self.f_Ki_f, self.NORMAL_CONST, self.data and self.extra_data; writes
        self.Sigma_tilde, self.Wi_K_i, self.Z, self.Y, self.YYT,
        self.covariance_matrix, self.precision and self.dZ_dK.
        """
        # Element-wise reciprocal of W, laid out on the diagonal of Sigma_tilde
        Wi = 1.0/self.W
        self.Sigma_tilde = np.diagflat(Wi)

        # Simulated observations: W^{-1} * Ki_f + f_hat
        # (presumably Ki_f holds K^{-1} f_hat, matching the docstring formula - TODO confirm)
        Y_tilde = Wi*self.Ki_f + self.f_hat

        # Wi_K_i is simply an alias of the already-computed W12BiW12
        self.Wi_K_i = self.W12BiW12
        # presumably the log-determinant of (Sigma_tilde + K), judging by the name - verify pddet
        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
        # Noise model log-density evaluated at f_hat for the stored data
        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)

        # Correction term stored as self.Z (see "Secondly" in the docstring)
        Z_tilde = (+ lik
                   - 0.5*self.ln_B_det
                   + 0.5*ln_det_Wi_K
                   - 0.5*self.f_Ki_f
                   + 0.5*y_Wi_K_i_y
                   + self.NORMAL_CONST
                  )

        #Convert to float as its (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = self.Sigma_tilde
        # Reciprocal of the covariance diagonal, reshaped to a column
        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]

        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
        self.dZ_dK = self._Kgradients()
        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
 class Posterior(object):
    """
-    An object to represent a Gaussian posterior over latent function values.
+    An object to represent a Gaussian posterior over latent function values, p(f|D).
    This may be computed exactly for Gaussian likelihoods, or approximated for
    non-Gaussian likelihoods.
    The purpose of this class is to serve as an interface between the inference
-    schemes and the model classes.
+    schemes and the model classes. The model class can make predictions for
    the function at any new point x_* by integrating over this posterior.
    """
    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
`@ -53,6 +53,6 @@ class ExactGaussianInference(object):`

	`likelihood.update_gradients(np.diag(dL_dK))`	`likelihood.update_gradients(np.diag(dL_dK))`

	`return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK}`	`return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}`