an afternoon's work on the Laplace approximation

James Hensman 2014-02-05 16:23:35 +00:00
parent 2ff7e286ee
commit f653bc430e
6 changed files with 120 additions and 225 deletions

View file

@@ -55,9 +55,9 @@ class GP(Model):
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
         self.parameters_changed()

     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
@@ -65,9 +65,6 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood

-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
-
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
         Internal helper function for making predictions, does not account
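After this change `GP.parameters_changed` is a thin shell: the inference object owns the posterior, the log marginal likelihood and a dictionary of gradients. A toy sketch of that three-part contract — `ToyRBF`, `ToyGaussianLikelihood` and `ToyExactInference` are hypothetical stand-ins, not GPy classes:

```python
import numpy as np

class ToyRBF(object):
    def K(self, X):
        # X is (N, 1); unit-lengthscale, unit-variance RBF
        return np.exp(-0.5 * (X - X.T)**2)

class ToyGaussianLikelihood(object):
    variance = 0.1

class ToyExactInference(object):
    def inference(self, kern, X, likelihood, Y):
        N = Y.shape[0]
        Ky = kern.K(X) + likelihood.variance * np.eye(N)
        L = np.linalg.cholesky(Ky)
        alpha = np.linalg.solve(L.T, np.linalg.solve(L, Y))
        # standard Gaussian log marginal (R&W 2006, eq. 2.30)
        log_marginal = (-0.5 * float(Y.T.dot(alpha))
                        - np.log(np.diag(L)).sum()
                        - 0.5 * N * np.log(2 * np.pi))
        Kyi = np.linalg.solve(L.T, np.linalg.solve(L, np.eye(N)))
        dL_dK = 0.5 * (alpha.dot(alpha.T) - Kyi)
        posterior = {'mean_weights': alpha}   # stand-in for the Posterior object
        return posterior, log_marginal, {'dL_dK': dL_dK}

X = np.linspace(0., 1., 5)[:, None]
Y = np.sin(6. * X)
post, lml, grads = ToyExactInference().inference(ToyRBF(), X, ToyGaussianLikelihood(), Y)
```

The `dL_dK` entry is the piece the model chains into kernel gradients, which is exactly what the Laplace file below does with `kern.update_gradients_full(dL_dK, X)`.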

View file

@@ -325,7 +325,6 @@ class Model(Parameterized):
             if self._fail_count >= self._allowed_failures:
                 raise e
             self._fail_count += 1
-        import ipdb;ipdb.set_trace()
         obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
         return obj_grads

View file

@@ -74,6 +74,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
+        self._param_slices_ = []
         self._connect_parameters()
         self._added_names_ = set()
         del self._in_init_
@@ -213,7 +214,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
-        self._param_slices_ = []
         for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
@@ -315,7 +315,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n

     def _get_params(self):
         # don't overwrite this anymore!
-        return numpy.hstack([x._get_params() for x in self._parameters_])
+        return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])

     def _set_params(self, params, update=True):
         # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
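Moving `_param_slices_ = []` up into `__init__` reads as a guard: `_connect_parameters` can return early, leaving the attribute undefined for the `izip` in `_set_params`. A hypothetical reduction of that failure mode (not GPy code):

```python
# parameters here are just integer sizes, purely for illustration
class Broken(object):
    def __init__(self, parameters):
        self._parameters_ = parameters
        self._connect_parameters()

    def _connect_parameters(self):
        if not self._parameters_:
            return                       # early exit: slices never created
        self._param_slices_ = [slice(0, n) for n in self._parameters_]

class Fixed(Broken):
    def __init__(self, parameters):
        self._param_slices_ = []         # always defined, as the diff does
        Broken.__init__(self, parameters)

print(hasattr(Broken([]), '_param_slices_'))  # False: downstream izip would crash
print(hasattr(Fixed([]), '_param_slices_'))   # True
```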

View file

@@ -42,8 +42,8 @@ class SparseGP(GP):
             raise NotImplementedError, "what to do what to do?"
             print "defaulting to ", inference_method, "for latent function inference"

-        self.Z = Param('inducing inputs', Z)
+        self.Z = Z
         self.num_inducing = Z.shape[0]

         if not (X_variance is None):
@@ -52,10 +52,8 @@ class SparseGP(GP):
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
+        self.Z = Param('inducing inputs', self.Z)
         self.add_parameter(self.Z, index=0)
-        self.add_parameter(self.kern)
-        self.add_parameter(self.likelihood)

     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
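The reshuffle lets `GP.__init__` register the kernel and likelihood itself; `SparseGP` then wraps `Z` as a `Param` afterwards and prepends it with `index=0`, so the inducing inputs occupy the first slice of the flattened parameter vector. A hypothetical reduction of that ordering (not GPy's real `add_parameter`):

```python
class ParamList(object):
    def __init__(self):
        self.params = []

    def add_parameter(self, p, index=None):
        if index is None:
            self.params.append(p)
        else:
            self.params.insert(index, p)   # index=0 puts Z ahead of kern/likelihood

m = ParamList()
m.add_parameter('kern')        # done inside GP.__init__ after this commit
m.add_parameter('likelihood')
m.add_parameter('Z', index=0)  # prepended by SparseGP afterwards
assert m.params == ['Z', 'kern', 'likelihood']
```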

View file

@@ -11,7 +11,8 @@
 #http://gaussianprocess.org/gpml/code.
 import numpy as np
-from ...util.linalg import mdot, jitchol, pddet, dpotrs
+from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs
+from ...util.misc import param_to_array
 from functools import partial as partial_func
 from posterior import Posterior
 import warnings
@@ -27,7 +28,6 @@ class LaplaceInference(object):
         (using Newton-Raphson) of the unnormalised posterior
         """
-        self.NORMAL_CONST = (0.5 * np.log(2 * np.pi))
         self._mode_finding_tolerance = 1e-7
         self._mode_finding_max_iter = 40
@@ -39,58 +39,133 @@ class LaplaceInference(object):
         Returns a Posterior class containing essential quantities of the posterior
         """
+        #make Y a normal array!
+        Y = param_to_array(Y)
+
         # Compute K
         K = kern.K(X)

         #Find mode
         if self.bad_fhat:
-            Ki_f_init = np.random.randn(*Y.shape)/50
+            Ki_f_init = np.zeros_like(Y)
         else:
             Ki_f_init = self._previous_Ki_fhat
-        self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
-        stop
+        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)

         #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
-        likelihood.gradient = self.likelihood_gradients()
-        dL_dK = self._Kgradients()
-        kern.update_gradients_full(dL_dK)
-        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
+        log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata)
+        #likelihood.gradient = self.likelihood_gradients()
+        kern.update_gradients_full(dL_dK, X)
+        self._previous_Ki_fhat = Ki_fhat.copy()
+        return Posterior(woodbury_vector=woodbury_vector, woodbury_inv=K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK}

-    def _shared_gradients_components(self):
-        """
-        A helper function to compute some common quantities
-        """
-        d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
-        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
-
-    def _Kgradients(self):
-        """
-        Gradients with respect to prior kernel parameters dL_dK, to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
-        #Explicit
-        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
-        #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
-        dL_dK = expl + impl
-        return dL_dK
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
+        """
+        Rasmussen's numerically stable mode finding.
+        For nomenclature see Rasmussen & Williams 2006.
+        Influenced by the GPML (BSD) code; all errors are our own.
+
+        :param K: covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param Y: the data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
+        :returns: f_hat, the mode on which to make the Laplace approximation
+        :rtype: np.ndarray
+        """
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
+
+        difference = np.inf
+        iteration = 0
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
+            W_f = W*f
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
+            b = W_f + grad # R&W p46, line 6.
+            #W12BiW12Kb, B_logdet = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
+            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46, line 6.
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, then compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+                self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+
+        return f, Ki_f
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+        """
+        At the mode, compute the hessian and effective covariance matrix.
+
+        :returns: logZ, the approximation to the log marginal likelihood,
+                  and the quantities needed for the posterior and its gradients
+        """
+        #At this point get the hessian matrix (or vector, as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
+        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i = K - C.T.dot(C)
+
+        #compute the log marginal
+        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
+
+        #compute dL_dK: explicit part
+        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
+
+        #Implicit part
+        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126, line 9.
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
+
+        dL_dK = explicit_part + implicit_part
+        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
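A note on what the new rasm_mode is doing: it never forms K^{-1}. The iteration runs in the transformed variable a = K^{-1}f, and the only factorisation is a Cholesky of the well-conditioned B = I + W^{1/2}KW^{1/2}. A self-contained sketch of the same scheme (Rasmussen & Williams 2006, Algorithm 3.1) for a Bernoulli likelihood with a logistic link; this is toy code under those assumptions, not the GPy implementation:

```python
import numpy as np

def laplace_mode(K, y, max_iter=40, tol=1e-7):
    """Mode of p(f|y) for a Bernoulli likelihood with logistic link, y in {0, 1}.

    Follows the numerically stable iteration of R&W 2006, Algorithm 3.1:
    work in a = K^{-1} f and factorise only B = I + W^{1/2} K W^{1/2}.
    """
    N = y.shape[0]
    f = np.zeros(N)
    a = np.zeros(N)                      # a = K^{-1} f
    for _ in range(max_iter):
        pi = 1.0 / (1.0 + np.exp(-f))    # Bernoulli mean
        W = pi * (1.0 - pi)              # -d2 log p(y|f)/df2 (diagonal)
        sqW = np.sqrt(W)
        L = np.linalg.cholesky(np.eye(N) + sqW[:, None] * K * sqW[None, :])
        b = W * f + (y - pi)             # R&W p46, line 6
        # a_new = b - W^{1/2} B^{-1} W^{1/2} K b, via two triangular solves
        v = np.linalg.solve(L, sqW * K.dot(b))
        a_new = b - sqW * np.linalg.solve(L.T, v)
        f_new = K.dot(a_new)
        difference = np.abs(f_new - f).sum() + np.abs(a_new - a).sum()
        f, a = f_new, a_new
        if difference < tol:
            break
    return f, a

# toy problem: 1-D inputs, RBF kernel, step-function labels
X = np.linspace(-3., 3., 30)[:, None]
K = np.exp(-0.5 * (X - X.T)**2) + 1e-8 * np.eye(30)
y = (X[:, 0] > 0).astype(float)
f_hat, a_hat = laplace_mode(K, y)
```

The committed version layers a Brent line search over the raw Newton step (the inner_obj/optimize.brent machinery above) and reuses the Cholesky factor of B: since |B| = |I + W^{1/2}KW^{1/2}|, the -sum(log(diag(L))) term in mode_computations is exactly -0.5*log|B|, the determinant part of the Laplace log marginal.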
     def likelihood_gradients(self):
         """
@@ -118,24 +193,7 @@ class LaplaceInference(object):
         return dL_dthetaL

-    def _compute_likelihood_variables(self):
-        """
-        At the mode, compute the hessian and effective covariance matrix.
-        """
-        #At this point get the hessian matrix (or vector, as W is diagonal)
-        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-        if not likelihood.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
-            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B,
         which has a positive diagonal and can be easily inverted
@@ -144,9 +202,7 @@ class LaplaceInference(object):
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12, ln_B_det)
+        :returns: (W12BiW12, L_B, Li_W12)
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
@@ -155,169 +211,13 @@ class LaplaceInference(object):
             # To cause the posterior to become less certain than the prior and likelihood,
             # This is a property only held by non-log-concave likelihoods

         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
-        """
-        Rasmussen's numerically stable mode finding.
-        For nomenclature see Rasmussen & Williams 2006.
-        Influenced by the GPML (BSD) code; all errors are our own.
-
-        :param K: covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        :param Y: the data
-        :type Y: np.ndarray
-        :param likelihood: the likelihood of the latent function value for the given data
-        :type likelihood: a GPy.likelihood object
-        :param Ki_f_init: the initial guess at the mode
-        :type Ki_f_init: np.ndarray
-        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
-        :type Y_metadata: np.ndarray | None
-        :returns: f_hat, the mode on which to make the Laplace approximation
-        :rtype: np.ndarray
-        """
-        ##Start f's at zero originally, or if we have gone off track, try restarting
-        #if self.old_Ki_f is None or self.bad_fhat:
-            #old_Ki_f = np.random.rand(self.N, 1)/50.0
-            ##old_Ki_f = self.Y
-            #f = np.dot(K, old_Ki_f)
-        #else:
-            ##Start at the old best point
-            #old_Ki_f = self.old_Ki_f.copy()
-            #f = self.f_hat.copy()
-        Ki_f = Ki_f_init.copy()
-        f = np.dot(K, Ki_f)
-
-        #define the objective function (to be maximised)
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
-
-        difference = np.inf
-        i = 0
-        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
-            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
-            W_f = W*f
-            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - Ki_f
-
-            #define an objective for the line search
-            def inner_obj(step_size):
-                Ki_f_trial = Ki_f + step_size*dKi_f
-                f_trial = np.dot(K, Ki_f_trial)
-                print -obj(Ki_f_trial, f_trial),
-                return -obj(Ki_f_trial, f_trial)
-
-            #use scipy for the line search, then compute new values of f, Ki_f
-            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
-            Ki_f_new = Ki_f + step*dKi_f
-            f_new = np.dot(K, Ki_f_new)
-            print ""
-            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
-            print ""
-            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact, then make a step; if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            #f = self.tmp_f.copy()
-            #Ki_f = self.tmp_Ki_f.copy()
-            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
-            Ki_f = Ki_f_new
-            f = f_new
-            i += 1
-
-        #Warn of bad fits
-        if difference > self._mode_finding_tolerance:
-            if not self.bad_fhat:
-                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-                self.bad_fhat = True
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now fine again")
-
-        return f, Ki_f
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the Laplace approximation to the posterior, but normalised.
-
-        GPy expects a likelihood to be gaussian, so we need to calculate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a Laplace approximation to a non-gaussian likelihood, but with
-        a gaussian likelihood.
-
-        Firstly,
-        the curvature of the unnormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}), but this assumes a non-gaussian likelihood;
-        we wish to find the covariance \Sigma^{\tilde} that has the same curvature
-        but uses our new simulated data Y^{\tilde},
-        i.e. we set N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and solve for Y^{\tilde} and \Sigma^{\tilde}.
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{\tilde} = W^{-1}.
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K + \Sigma^{\tilde}| - 0.5*Y^{\tilde T}(K + \Sigma^{\tilde})^{-1}Y^{\tilde} + lik.Z,
-        so we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat T}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log(|K||K^{-1} + W|),
-        which we want to optimize instead; by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default.
-
-        Thirdly,
-        since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        as well as the explicit dL_dK; we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code.
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST)
-
-        #Convert to float as it's (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK, which is how the approximated distribution's gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients' explicit part there's no need to compute this again
+        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p126, eq 5.25
+        return K_Wi_i, L, LiW12
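_compute_B_statistics now hands back reusable factors — K_Wi_i = W^{1/2}B^{-1}W^{1/2}, the Cholesky factor L, and LiW12 = L^{-1}W^{1/2} — instead of a one-shot product with a supplied matrix `a`. The identities behind those returns are easy to check numerically; a sketch with plain numpy standing in for GPy's `jitchol`/`dtrtrs`:

```python
import numpy as np

# Identity 1:  W^{1/2} B^{-1} W^{1/2} = (K + W^{-1})^{-1},  B = I + W^{1/2} K W^{1/2}
rng = np.random.RandomState(0)
A = rng.randn(5, 5)
K = A.dot(A.T) + 0.5 * np.eye(5)       # a positive definite "covariance"
W = rng.rand(5) + 0.1                  # diagonal of the negative hessian

sqW = np.sqrt(W)
B = np.eye(5) + sqW[:, None] * K * sqW[None, :]
L = np.linalg.cholesky(B)
LiW12 = np.linalg.solve(L, np.diag(sqW))   # plays the role of dtrtrs(L, diag(W_12))
K_Wi_i = LiW12.T.dot(LiW12)                # W^{1/2} B^{-1} W^{1/2}
assert np.allclose(K_Wi_i, np.linalg.inv(K + np.diag(1.0 / W)))

# Identity 2: the same pieces give (K^{-1} + W)^{-1} = K - (LiW12 K)^T (LiW12 K),
# exactly how mode_computations builds Ki_W_i from C = np.dot(LiW12, K)
C = LiW12.dot(K)
Ki_W_i = K - C.T.dot(C)
assert np.allclose(Ki_W_i, np.linalg.inv(np.linalg.inv(K) + np.diag(W)))
```

The first identity is why K_Wi_i can be passed straight to the Posterior as woodbury_inv; the second is how mode_computations obtains the effective posterior covariance without ever inverting K.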

View file

@@ -48,6 +48,7 @@ class Posterior(object):
         if ((woodbury_chol is not None) and (woodbury_vector is not None))\
            or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+           or ((woodbury_inv is not None) and (mean is not None))\
            or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:
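The extra clause accepts a (woodbury_inv, mean) pairing as sufficient to define a posterior, alongside the (woodbury_vector, woodbury_inv) pairing the Laplace code now supplies. The point of standardising on these quantities is that prediction becomes likelihood-agnostic; a sketch of the usual predictive equations in those terms (`predict_from_woodbury` and `kern_K` are illustrative names, not GPy's API):

```python
import numpy as np

def predict_from_woodbury(kern_K, X, Xnew, woodbury_vector, woodbury_inv):
    """Sketch: GP predictive mean/variance from the Woodbury quantities.

    mean = K_{*x} woodbury_vector
    var  = diag(K_{**} - K_{*x} woodbury_inv K_{x*})
    For the Laplace case, woodbury_vector = dlog p(y|f_hat)/df and
    woodbury_inv = (K + W^{-1})^{-1}, matching R&W 2006, eqs 3.21 and 3.24.
    """
    Kxs = kern_K(Xnew, X)                               # (M, N) cross-covariance
    mean = Kxs.dot(woodbury_vector)
    var = (np.diag(kern_K(Xnew, Xnew))
           - np.sum(Kxs.dot(woodbury_inv) * Kxs, axis=1))[:, None]
    return mean, var
```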