From 1420aa532c5df8eaf4e6db5b89e77f4b375ebf1c Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 19 Apr 2013 12:23:00 +0100
Subject: [PATCH] Attempted to introduce gradient methods; I doubt they will
 work yet

---
 GPy/examples/__init__.py                |   1 +
 GPy/likelihoods/Laplace.py              | 120 ++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py |  58 +++++++++++-
 GPy/models/GP.py                        |  16 +++-
 GPy/util/linalg.py                      |  19 +++-
 5 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 551bff54..68832e77 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+import laplace_approximations
 import classification
 import regression
 import dimensionality_reduction
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4d94ba0f..b1b41957 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -4,28 +4,9 @@ import GPy
 from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 
-#TODO: Move this to utils
-
-
-def det_ln_diag(A):
-    """
-    log determinant of a diagonal matrix
-    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
-    """
-    return np.log(np.diagonal(A)).sum()
-
-
-def pddet(A):
-    """
-    Determinant of a positive definite matrix
-    """
-    L = cholesky(A)
-    logdetA = 2*sum(np.log(np.diag(L)))
-    return logdetA
-
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -75,17 +56,92 @@ class Laplace(likelihood):
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
-        return np.zeros(0)
+        return np.asarray(self.likelihood_function._get_params())
 
     def _get_param_names(self):
-        return []
+        return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        self.likelihood_function._set_params(p)
+
+    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+        Find the gradients of the marginal likelihood w.r.t. both thetaK and thetaL
+
+        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
+        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted
+
+        Similar terms arise when finding the gradients with respect to changes in the likelihood
+        parameters
+        """
+        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
+
+    def _shared_gradients_components(self):
+        dL_dytil = -np.dot(pdinv(self.K + self.Sigma_tilde)[0], self.Y) # -(K + Sigma_tilde)^{-1} y_tilde
+        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        return dL_dytil, dytil_dfhat
+
+    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+                          #explicit               #implicit                     #implicit
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+
+        :param dL_d_K_Sigma: derivative of the marginal with respect to K_prior + Sigma_tilde (the covariance of y_tilde)
+        :param dK_dthetaK: explicit derivative of the kernel with respect to its hyperparameters
+        :returns: dL_dthetaK - gradients of the marginal likelihood w.r.t. changes in the K hyperparameters
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+
+        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        #FIXME: Careful, dK_dthetaK is the derivative of the prior K only, not of the marginal!
+        #Matrix products; this assumes dK_dthetaK is the full NxN matrix for a single hyperparameter
+        dfhat_dthetaK = mdot(I_KW_i, dK_dthetaK, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
+
+        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK)
+
+        #FIXME: Careful, dL_dK = dL_d_K_Sigma
+        #FIXME: Careful, the -D*0.5 in dL_d_K_Sigma might need to be -0.5?
+        dL_dSigma = dL_d_K_Sigma
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+                           #explicit #implicit
+        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: careful of this sum, it should sum over fhat, not over the thetas
+        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+
+        #Inner products: sum over fhat for the ytil term, over matrix entries for the Sigma term
+        dL_dthetaK_implicit = np.sum(dL_dytil*dytil_dthetaK) + np.sum(dL_dSigma*dSigma_dthetaK)
+        return dL_dthetaK_implicit
 
     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
-        raise NotImplementedError
+        """
+        Gradients with respect to the likelihood parameters
+
+        The gradient differs between parameters of the kernel, \theta_{K}, and
+        parameters of the likelihood, \theta_{L}:
+
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
+        dL_dK*dK_dthetaL = 0
+
+        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
+        dytil_dfhat = Sigma*Ki + I
+
+        fhat = K * dlogp(y|fhat)_dfhat                              from Rasmussen & Williams p. 125
+        dfhat_dthetaK = (I + KW)i * dK_dthetaK * dlogp(y|fhat)_dfhat  from Rasmussen & Williams p. 125
+
+        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
+        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
+        d2phi_d2fhat = Hessian function of the likelihood
+
+        partial = dL_dK
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+
+        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have an implicit component as well?
+        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        dL_dSigma = partial # partial is dL_dK, but K here is K+Sigma_tilde... which is fine in this case
+
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        dL_dthetaL = 0 + np.sum(dL_dytil*dytil_dthetaL) + np.sum(dL_dSigma*dSigma_dthetaL) # explicit dL_dK*dK_dthetaL term is zero
+        return dL_dthetaL
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
        """
@@ -112,8 +168,9 @@ class Laplace(likelihood):
 
         $$\tilde{\Sigma} = W^{-1}$$
         """
-        epsilon = 1e-6
+        epsilon = 1e14 # condition number threshold used below to warn of near-singularity
 
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*W)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
 
         ##Check it isn't singular!
-        if cond(Lt_W) > 1e14:
+        if cond(Lt_W) > epsilon:
             print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
-        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
@@ -156,16 +214,16 @@ class Laplace(likelihood):
         #)
 
         ##Check it isn't singular!
-        if cond(self.W) > 1e14:
+        if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        Sigma_tilde = inv(self.W) # Damn
+        self.Sigma_tilde = inv(self.W) # Damn, a dense inverse; kept on self as the gradients need Sigma_tilde
 
         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = Sigma_tilde
+        self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
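Note on the implicit term that _Kgradients is built around: the identity
dfhat_dthetaK = (I + KW)^{-1} * dK_dthetaK * dlogp(y|fhat)_dfhat is cheap to
sanity-check by finite differences before wiring it into the chain rule. The
following is a minimal standalone sketch, not GPy code: the RBF kernel, the
fixed Student-t parameters and all names are illustrative assumptions, and
dK_dthetaK is taken to be the full NxN matrix for a single hyperparameter
(the lengthscale ell).

    import numpy as np

    v, sigma = 4.0, 1.0  # fixed Student-t parameters for this check

    def rbf(X, ell):
        d2 = (X - X.T)**2
        return np.exp(-0.5 * d2 / ell**2) + 1e-8 * np.eye(X.shape[0])

    def drbf_dell(X, ell):
        # derivative of the RBF kernel w.r.t. its lengthscale (jitter drops out)
        d2 = (X - X.T)**2
        return np.exp(-0.5 * d2 / ell**2) * d2 / ell**3

    def dlogp(f, y):
        # gradient of sum_i log p(y_i|f_i) for the Student-t likelihood
        e = f - y
        return -(v + 1) * e / (e**2 + sigma**2 * v)

    def d2logp(f, y):
        # diagonal of the Hessian of the log-likelihood
        e = f - y
        return (v + 1) * (e**2 - sigma**2 * v) / (e**2 + sigma**2 * v)**2

    def find_fhat(K, y, iters=100):
        # Newton iterations for the posterior mode fhat = K dlogp(y|fhat)
        f = np.zeros_like(y)
        for _ in range(iters):
            W = -np.diag(d2logp(f, y))
            f = np.linalg.solve(np.eye(len(y)) + K.dot(W), K.dot(dlogp(f, y) + W.dot(f)))
        return f

    X = np.linspace(0, 5, 8)[:, None]
    y = 0.5 * np.sin(X[:, 0])
    ell = 1.0

    K = rbf(X, ell)
    fhat = find_fhat(K, y)
    W = -np.diag(d2logp(fhat, y))
    implicit = np.linalg.solve(np.eye(8) + K.dot(W), drbf_dell(X, ell).dot(dlogp(fhat, y)))

    # central finite difference of the mode w.r.t. the lengthscale
    eps = 1e-6
    fd = (find_fhat(rbf(X, ell + eps), y) - find_fhat(rbf(X, ell - eps), y)) / (2 * eps)
    print(np.allclose(implicit, fd, atol=1e-5))  # True if the identity holds

The same pattern extends to checking the full dL_dthetaK chain once the shapes
in _Kgradients settle down.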
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c759e15f..6e72b029 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -20,6 +20,16 @@ class likelihood_function:
     def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale
+        self.log_concave = True
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass
 
 class probit(likelihood_function):
     """
@@ -149,12 +159,22 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        likelihood_function.__init__(self) # old-style base class, so super() cannot be used here
         self.v = deg_free
         self.sigma = sigma
-
-        #FIXME: This should be in the superclass
         self.log_concave = False
 
+    def _get_params(self):
+        return np.asarray([self.sigma])
+
+    def _get_param_names(self):
+        return ["t_noise_variance"]
+
+    def _set_params(self, x):
+        self.sigma = float(x)
+        #self.covariance_matrix = np.eye(self.N)*self._variance
+        #self.precision = 1./self._variance
+
     @property
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
@@ -222,6 +242,40 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
+    def d3link(self, y, f, extra_data=None):
+        """
+        Third order derivative of the link function (log-likelihood) at y given f, w.r.t. f
+
+        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        #NB f-y not y-f
+        e = f - y
+        d3link_d3f = ( (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+                       / ((e**2 + (self.sigma**2)*self.v)**3)
+                     )
+        return d3link_d3f
+
+    def link_hess_grad_sigma(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian w.r.t. the sigma parameter
+
+        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f # only e**2 appears below, so the sign convention does not matter here
+        hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+                            / ((e**2 + (self.sigma**2)*self.v)**3)
+                          )
+        return hess_grad_sigma
+
+    def _gradients(self, y, f, extra_data=None):
+        return [self.link_hess_grad_sigma(y, f, extra_data)] # a list, as we might learn many parameters
+
     def predictive_values(self, mu, var):
         """
         Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
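The closed forms above are easy to get wrong (the factor of v in
link_hess_grad_sigma, the sign convention for e in d3link), so a standalone
finite-difference check is cheap insurance. A minimal sketch with plain
functions and made-up test values, not the GPy classes:

    import numpy as np

    v = 4.0  # fixed degrees of freedom

    def hess(f, y, sigma):
        # second derivative of the Student-t log-likelihood in f
        e = f - y
        return (v + 1) * (e**2 - v * sigma**2) / (v * sigma**2 + e**2)**2

    def d3link(f, y, sigma):
        # third derivative in f, as in student_t.d3link
        e = f - y
        return -2 * (v + 1) * (e**3 - 3 * sigma**2 * v * e) / (e**2 + sigma**2 * v)**3

    def hess_grad_sigma(f, y, sigma):
        # gradient of the Hessian in sigma; note the factor of v from the docstring
        e = f - y
        return 2 * sigma * v * (v + 1) * (sigma**2 * v - 3 * e**2) / (e**2 + sigma**2 * v)**3

    f, y, sigma, eps = 0.3, -0.7, 0.9, 1e-6
    fd_f = (hess(f + eps, y, sigma) - hess(f - eps, y, sigma)) / (2 * eps)
    fd_s = (hess(f, y, sigma + eps) - hess(f, y, sigma - eps)) / (2 * eps)
    print(np.allclose(fd_f, d3link(f, y, sigma)))           # True
    print(np.allclose(fd_s, hess_grad_sigma(f, y, sigma)))  # True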
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cfda0cfe..1024b5ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -8,7 +8,7 @@ from .. import kern
 from ..core import model
 from ..util.linalg import pdinv,mdot
 from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -128,7 +128,19 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        if isinstance(self.likelihood, Laplace):
+            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            #Pass a ones matrix so dK_dthetaK is not chained with dL_dK; NB this still contracts to sum_ij dK_ij/dthetaK per parameter, not the full matrix
+            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+
+            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        return np.hstack((dL_dthetaK, dL_dthetaL))
 
     def _raw_predict(self,_Xnew,slices=None, full_cov=False):
         """
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f88099a4..cb899397 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -14,6 +14,21 @@ import types
 #import scipy.lib.lapack.flapack
 import scipy as sp
 
+def det_ln_diag(A):
+    """
+    Log-determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+def pddet(A):
+    """
+    Log-determinant of a positive definite matrix
+    """
+    L = jitchol(A) # use this module's Cholesky helper; a bare cholesky may not be in scope here
+    logdetA = 2*np.sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a,b):
     """
     efficiently compute the trace of the matrix product of a and b
     """
@@ -166,8 +181,8 @@
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-
-    #Y -= Y.mean(axis=0)
+
+    #Y -= Y.mean(axis=0)
     Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]
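Note that both helpers moved into GPy/util/linalg.py return log-determinants.
A small standalone sketch of the identities they rely on, checked against
numpy's slogdet (the functions are re-implemented here so the check runs
without GPy, with np.linalg.cholesky standing in for jitchol):

    import numpy as np

    def det_ln_diag(A):
        # log|A| for diagonal A is the sum of the logs of the diagonal entries
        return np.log(np.diagonal(A)).sum()

    def pddet(A):
        # log|A| for positive definite A: with A = L L^T, |A| = prod(diag(L))^2
        L = np.linalg.cholesky(A)
        return 2 * np.sum(np.log(np.diag(L)))

    rng = np.random.RandomState(0)
    D = np.diag(rng.rand(5) + 0.1)   # diagonal with positive entries
    B = rng.randn(5, 5)
    A = B.dot(B.T) + 5 * np.eye(5)   # positive definite

    print(np.allclose(det_ln_diag(D), np.linalg.slogdet(D)[1]))  # True
    print(np.allclose(pddet(A), np.linalg.slogdet(A)[1]))        # True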