From 117c377d13efe81b2df567936ff48e85f918efcd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 14:02:03 +0100
Subject: [PATCH] Ripped out all things Laplace parameter estimation, starting
 again with new tactic

---
 GPy/likelihoods/Laplace.py | 175 +------------------------------------
 GPy/models/GP.py           |   8 +-
 2 files changed, 4 insertions(+), 179 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 666fa227..69c0876b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,187 +79,18 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        Ki, _, _, _ = pdinv(self.K)
-
-        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
-        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
-        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
-        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
-        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
-        c = inv(self.K+self.Sigma_tilde)
-        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
-
-        P = np.diagflat(1/np.dot(Ki, self.f_hat))
-        K_Wi_i = inv(self.K+self.Sigma_tilde)
-
-        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
-                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
-                                           ) * np.eye(self.N))
-        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
-        dL_dytil = dL_dytil.reshape(1, self.N)
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-
-
-        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
-        #dytil_dfhat_explicit = self.Wi__Ki_W
-        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
-        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
-
-        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
-        self.dytil_dfhat = dytil_dfhat
-        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
-        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
-        return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
-                           #explicit                #implicit                     #implicit
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
-        :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
-        :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
+        Gradients with respect to prior kernel parameters
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-
-        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
-
-        #print "Computing K gradients"
-        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        #I = np.eye(self.N)
-        #C = np.dot(self.K, self.W)
-        #A = I + C
-        #plt.imshow(A)
-        #plt.show()
-
-        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
-        #B = I + w12*K*w12
-        I_KW_i = self.Bi # could use self.B_chol??
-
-        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
-        #Derivative for each f dimension, for each of K's hyper parameters
-        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        for ind_j, thetaj in enumerate(dK_dthetaK):
-            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
-
-        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
-        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
-        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "\n"
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = dL_d_K_Sigma
-        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-                     #explicit           #implicit
-        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        dSigma_dthetaK_explicit = 0
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
-        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
-        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
-        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
-        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            #dSigma_dthetaK_explicit = 0
-            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
-            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        #FIXME: Won't handle multi dimensional data
-        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
-        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-
-        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        print "\n"
-        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
-        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
-        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        print "\n"
-        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-
-        return np.squeeze(dL_dthetaK_implicit)
+        return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
-
-        Complicated, it differs for parameters of the kernel \theta_{K}, and
-        parameters of the likelihood, \theta_{L}
-
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
-        dL_dK*dK_dthetaL = 0
-
-        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
-        dytil_dfhat = Sigma*Ki + I
-
-        fhat = K*log_p(y|fhat)                                          from rasm p125
-        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
-
-        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
-        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
-        d2phi_d2fhat = Hessian function of likelihood
-
-        partial = dL_d_K_Sigma
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-
-        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
-        #KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
-        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
-
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
-
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #-1 as we are looking at W which is -1*d2log p(y|f)
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
-        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
-
-        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        #Derivative for each f dimension, for each of K's hyper parameters
-        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             #self.Sigma_tilde
-                                             #)
-
-        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
-        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-
-        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
-
-        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
-        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-
-        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 17e2a1b1..da379eb1 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -150,14 +150,8 @@ class GP(model):
             fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
-            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
-            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
-
-            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
-
+            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))