From 399adb1b008180112fa97ee20877a109bfad9f3c Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 31 Jan 2014 16:59:06 +0000
Subject: [PATCH] some documenting, and fiddling with the laplace approx

---
 .../exact_gaussian_inference.py               |   2 +-
 .../latent_function_inference/laplace.py      | 235 ++++++------------
 .../latent_function_inference/posterior.py    |   5 +-
 3 files changed, 86 insertions(+), 156 deletions(-)

diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 907e8485..2c3bfa6a 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -53,6 +53,6 @@ class ExactGaussianInference(object):
 
         likelihood.update_gradients(np.diag(dL_dK))
 
-        return Posterior(LW, alpha, K), log_marginal, {'dL_dK':dL_dK}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK}
 
 
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index fa5bb3b8..e5165da6 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -15,49 +15,32 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
+from posterior import Posterior
 import warnings
 
 class LaplaceInference(object):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, noise_model, extra_data=None):
+    def __init__(self):
         """
         Laplace Approximation
 
         Find the moments \hat{f} and the hessian at this point
         (using Newton-Raphson) of the unnormalised posterior
 
-        Compute the GP variables (i.e. generate some Y^{squiggle} and
-        z^{squiggle} which makes a gaussian the same as the laplace
-        approximation to the posterior, but normalised
-
-        Arguments
-        ---------
-
-        :param data: array of data the likelihood function is approximating
-        :type data: NxD
-        :param noise_model: likelihood function - subclass of noise_model
-        :type noise_model: noise_model
-        :param extra_data: additional data used by some likelihood functions,
         """
-        self.data = data
-        self.noise_model = noise_model
-        self.extra_data = extra_data
-
         #Inital values
-        self.N, self.D = self.data.shape
-        self.is_heteroscedastic = True
-        self.Nparams = 0
         self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
 
-        self.restart()
-        likelihood.__init__(self)
 
     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
 
+        self.N, self.D = self.data.shape
+        self.restart()
+
         # Compute K
         self.K = kern.K(X)
         self.data = Y
@@ -69,10 +52,11 @@ class LaplaceInference(object):
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
 
-        #Compute fake variables replicating laplace approximation to posterior
-        self._compute_GP_variables()
+        likelihood.gradient = self.likelihood_gradients()
+        dL_dK = self._Kgradients()
+        kern.update_gradients_full(dL_dK)
 
-        return Posterior(mean=self.f_hat, cov=self.covariance_matrix, K=self.K)
+        return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK}
 
     def restart(self):
         """
@@ -88,37 +72,10 @@ class LaplaceInference(object):
         self.old_Ki_f = None
         self.bad_fhat = False
 
-    def predictive_values(self,mu,var,full_cov,**noise_args):
-        if full_cov:
-            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var,**noise_args)
-
-    def log_predictive_density(self, y_test, mu_star, var_star):
-        """
-        Calculation of the log predictive density
-
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
-
-        :param y_test: test observations (y_{*})
-        :type y_test: (Nx1) array
-        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type mu_star: (Nx1) array
-        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
-        :type var_star: (Nx1) array
-        """
-        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
-
-    def _get_params(self):
-        return np.asarray(self.noise_model._get_params())
-
-    def _get_param_names(self):
-        return self.noise_model._get_param_names()
-
-    def _set_params(self, p):
-        return self.noise_model._set_params(p)
-
     def _shared_gradients_components(self):
+        """
+        A helper function to compute some common quantities
+        """
         d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
         dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
@@ -132,41 +89,30 @@ class LaplaceInference(object):
         :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
+        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
 
         #Explicit
-        #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        #expl_b = self.Wi_K_i
-        #expl = 0.5*expl_a - 0.5*expl_b
-        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
+        expl_b = self.Wi_K_i
+        expl = 0.5*expl_a - 0.5*expl_b
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
 
         #Implicit
         impl = mdot(dlp, dL_dfhat, I_KW_i)
 
-        #No longer required as we are computing these in the gp already
-        #otherwise we would take them away and add them back
-        #dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
-        #dL_dK = expl + impl
+        dL_dK = expl + impl
 
-        #No need to compute explicit as we are computing dZ_dK to account
-        #for the difference between the K gradients of a normal GP,
-        #and the K gradients including the implicit part
-        dL_dK = impl
         return dL_dK
 
-    def _gradients(self, partial):
+    def likelihood_gradients(self):
         """
         Gradients with respect to likelihood parameters (dL_dthetaL)
 
-        :param partial: Not needed by this likelihood
-        :type partial: lambda function
         :rtype: array of derivatives (1 x num_likelihood_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
 
-        #len(dlik_dthetaL)
         num_params = len(self._get_param_names())
         # make space for one derivative for each likelihood parameter
         dL_dthetaL = np.zeros(num_params)
@@ -184,88 +130,9 @@ class LaplaceInference(object):
 
         return dL_dthetaL
 
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                  )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
-
-    def fit_full(self, K):
-        """
-        The laplace approximation algorithm, find K and expand hessian
-        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
-
-        :param K: Prior covariance matrix evaluated at locations X
-        :type K: NxN matrix
-        """
-        self.K = K.copy()
-
-        #Find mode
-        self.f_hat = self.rasm_mode(self.K)
-
-        #Compute hessian and other variables at mode
-        self._compute_likelihood_variables()
-
-        #Compute fake variables replicating laplace approximation to posterior
-        self._compute_GP_variables()
-
     def _compute_likelihood_variables(self):
         """
-        Compute the variables required to compute gaussian Y variables
+        At the mode, compute the hessian and effective covaraince matrix.
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
         self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
@@ -422,3 +289,65 @@ class LaplaceInference(object):
 
         self.Ki_f = Ki_f
         return f
+
+    def _compute_GP_variables(self):
+        """
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
+
+        GPy expects a likelihood to be gaussian, so need to caluclate
+        the data Y^{\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood but with
+        a gaussian likelihood
+
+        Firstly,
+        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
+        we wish to find the hessian \Sigma^{\tilde}
+        that has the same curvature but using our new simulated data Y^{\tilde}
+        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
+        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
+
+        Secondly,
+        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
+        which we want to optimize instead, by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
+
+        Thirdly,
+        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
+        """
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W12BiW12
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
+        Z_tilde = (+ lik
+                   - 0.5*self.ln_B_det
+                   + 0.5*ln_det_Wi_K
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_K_i_y
+                   + self.NORMAL_CONST
+                  )
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
+        self.dZ_dK = self._Kgradients()
+        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+
+
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index b3a352e5..c0974dc5 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -6,12 +6,13 @@ from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify
 
 class Posterior(object):
     """
-    An object to represent a Gaussian posterior over latent function values.
+    An object to represent a Gaussian posterior over latent function values, p(f|D).
     This may be computed exactly for Gaussian likelihoods, or approximated for
     non-Gaussian likelihoods.
 
     The purpose of this class is to serve as an interface between the inference
-    schemes and the model classes.
+    schemes and the model classes.  the model class can make predictions for
+    the function at any new point x_* by integrating over this posterior.
 
     """
     def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):