From a7169ab1ab771e567e45d6a11ae9e13b13f3c754 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 1 Jul 2013 15:21:47 +0100
Subject: [PATCH] Fixed bug where B wasn't refering to current f location

---
 GPy/core/model.py                       |  3 +++
 GPy/examples/laplace_approximations.py  |  5 +++--
 GPy/likelihoods/Laplace.py              | 21 ++++++++++-----------
 GPy/likelihoods/likelihood_functions.py |  6 +++++-
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 94202396..83a4a428 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,6 +244,9 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
+        print self
+        #self.checkgrad(verbose=1)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 24f2d88c..bb621424 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -100,7 +100,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -152,7 +152,7 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     m.ensure_default_constraints()
-    #m.constrain_fixed('t_noi', real_sd)
+    m.constrain_bounded('t_noi', 0.001, 10)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
     #m['t_noise'] = 0.667083294421005
@@ -168,6 +168,7 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
     print "Real noise std: ", real_sd
+    print "or Real noise std: ", real_stu_t_std
     return m
 
     #print "Clean student t, ncg"
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index e4652f27..4c9c67df 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -158,7 +158,6 @@ class Laplace(likelihood):
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
 
         y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
@@ -199,14 +198,14 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-8  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        #self.Bi, _, _, B_det = pdinv(self.B)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
         #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
@@ -305,14 +304,14 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
+        epsilon = 1e-10
         step_size = 1
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 0#1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -335,13 +334,13 @@ class Laplace(likelihood):
             def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
-                self.a = a
+                self.a = a # This is nasty, need to set something within an optimization though
                 self.f = f
                 return -obj(a, f)
 
             from functools import partial
             i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
 
             #update_passed = False
             #while not update_passed:
@@ -373,8 +372,8 @@ class Laplace(likelihood):
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #self.a = a
-        self.B, self.B_chol, self.W_12 = B, L, W_12
-        self.Bi, _, _, B_det = pdinv(self.B)
+        #self.B, self.B_chol, self.W_12 = B, L, W_12
+        #self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index ebc87f56..57627198 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -195,8 +195,9 @@ class student_t(likelihood_function):
         e = y - f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
                      - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
+                     #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
                     )
         return np.sum(objective)
 
@@ -264,6 +265,7 @@ class student_t(likelihood_function):
         dlik_dsigma = ( - (1/self.sigma) +
                         ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
                       )
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -290,6 +292,8 @@ class student_t(likelihood_function):
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
                               ((e**2 + (self.sigma**2)*self.v)**3)
                            )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):