From 787a038401ee959fbbd8bfe354c84c1d4cbd56fa Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 14 May 2013 16:23:18 +0100
Subject: [PATCH] Still getting closer to grads for likelihood

---
 GPy/examples/laplace_approximations.py  |  4 ++--
 GPy/likelihoods/Laplace.py              | 16 ++++++----------
 GPy/likelihoods/likelihood_functions.py |  4 ++--
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2054881c..eb725b53 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -95,10 +95,10 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    m.optimize('scg', messages=True)
     print(m)
     return m
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5b3e8f43..2af51f2b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -201,24 +201,22 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
-        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
         #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
         KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
+        #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
 
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
 
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         #-1 as we are looking at W which is -1*d2log p(y|f)
         #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
@@ -242,10 +240,8 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 6eef9f33..1a9dac75 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -256,7 +256,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        d3link_d3f = (  (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
         return np.squeeze(d3link_d3f)
@@ -286,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)