From f3b8dfb2225c8a25a0b753ec0e2f63b28cdec827 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 3 Jun 2013 14:51:09 +0100
Subject: [PATCH] about to input new derivations for Z's...

---
 GPy/examples/laplace_approximations.py | 15 +++++++++++---
 GPy/likelihoods/Laplace.py             | 28 ++++++++++++++++----------
 GPy/models/GP.py                       | 17 ++++++++--------
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14ff44a0..ee71a950 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -143,11 +143,12 @@ def student_t_approx():
     Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
+    Yc[26] += 1000
     Yc[24] += 10
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000000000000
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
@@ -187,21 +188,25 @@ def student_t_approx():
     plt.subplot(211)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian clean')
     print m
 
     #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    m.optimize()
+    #m.optimize()
     plt.subplot(212)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian corrupt')
     print m
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -215,6 +220,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -228,6 +234,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm corrupt')
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -241,6 +248,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg clean')
 
     print "Corrupt student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -254,6 +262,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg corrupt')
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 74d37d48..45fddeaa 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -6,7 +6,10 @@ from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
+import random
 #import pylab as plt
+np.random.seed(50)
+random.seed(50)
 
 
 class Laplace(likelihood):
@@ -156,6 +159,7 @@ class Laplace(likelihood):
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
 
+
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
@@ -239,15 +243,15 @@
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
+        #Do the computation again at f to get Ki_f which is useful
         b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        a = b - self.W_12*solve_chol
-        self.Ki_f = a
+
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
 
         self.ln_K_det = pddet(self.K)
+        #_, _, _, self.ln_K_det = pdinv(self.K)
 
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
@@ -296,7 +300,7 @@ class Laplace(likelihood):
             res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
-        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
     def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
@@ -336,17 +340,19 @@
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
+            b = step_size*b
 
-            #a should be equal to Ki*f now so should be able to use it
-            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-
-            solve_L = cho_solve((L, True), W_12*c)
-
-            f = c - np.dot(K, W_12*solve_L)
+            #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement
+            #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            #solve_L = cho_solve((L, True), W_12*c)
+            #f = c - np.dot(K, W_12*solve_L)
 
+            #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj?
+            #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason,
+            #Document it!
             solve_L = cho_solve((L, True), W_12*np.dot(K, b))
-
             a = b - W_12*solve_L
+            f = np.dot(K, a)
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0ba20d7b..e4ed52ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,23 +142,22 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK before: ",dL_dthetaK
         if isinstance(self.likelihood, Laplace):
             #Reapproximate incase it hasnt been done...
-            if isinstance(self.likelihood, Laplace):
-                self.likelihood.fit_full(self.kern.K(self.X))
-                self.likelihood._set_params(self.likelihood._get_params())
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+            print self.kern._get_params()
 
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
+            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
+            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "dL_dthetaK after: ",dL_dthetaK
+            #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
+            dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
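
For reference: the rasm_mode hunk above replaces the old damped update with the numerically stable Newton step for the Laplace approximation's mode (Algorithm 3.1 in Rasmussen and Williams, "Gaussian Processes for Machine Learning"), with the Newton move scaled by step_size. The sketch below is a minimal standalone version of that step under stated assumptions, not GPy code: it mirrors the patch's names (K, W, W_12, b, a, f, step_size), while dlik_df and d2lik_d2f are assumed illustrative callables returning the first and second derivatives of log p(y|f) as (N, 1) arrays.

import numpy as np
from scipy.linalg import cholesky, cho_solve

def laplace_newton_step(K, f, dlik_df, d2lik_d2f, step_size=1.0):
    #One damped Newton step towards f_hat = argmax_f [log p(y|f) - 0.5 f^T K^{-1} f].
    #With W = -d2 log p(y|f)/df^2 (diagonal, stored as a column vector):
    #    b = W f + d log p(y|f)/df
    #    a = b - W^{1/2} B^{-1} (W^{1/2} K b),  where B = I + W^{1/2} K W^{1/2}
    #    f_new = K a
    #so K is never inverted directly.
    N = f.shape[0]
    W = -d2lik_d2f(f)                  #(N, 1) negative curvature of the log-likelihood
    W_12 = np.sqrt(W)
    B = np.eye(N) + W_12 * K * W_12.T  #B = I + W^{1/2} K W^{1/2}, row- and column-scaled
    L = cholesky(B, lower=True)
    b = W * f + dlik_df(f)
    b = step_size * b                  #the patch damps the full Newton move this way
    a = b - W_12 * cho_solve((L, True), W_12 * np.dot(K, b))
    f_new = np.dot(K, a)               #at convergence, a approximates K^{-1} f_hat
    return f_new, a

Working through B = I + W^{1/2} K W^{1/2} keeps the solve well conditioned even when K is nearly singular, which is presumably why the patch keeps the b/a/f form and comments out the earlier c-based update.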