From de689fa8e91928b7fc2d02f56d4eca14d82eaafd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 19 Jun 2013 12:00:00 +0100
Subject: [PATCH] Now gradchecks everytime but student_t fit is bad, noise is
 underestimated by a long way

---
 GPy/examples/laplace_approximations.py  | 18 +++++++++--------
 GPy/likelihoods/Laplace.py              | 27 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py | 16 +--------------
 GPy/models/GP.py                        | 12 -----------
 4 files changed, 29 insertions(+), 44 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5120dfb5..84527d08 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 15)[:, None]
+    X = np.linspace(0.0, 10.0, 50)[:, None]
     #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 15)[:, None]
+    X_full = np.linspace(0.0, 10.0, 50)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
     real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise std: ", real_sd
 
-    initial_var_guess = 0.02
+    initial_var_guess = 0.3
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = initial_var_guess #real_sd
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m['white'] = 1e-3
     #m.constrain_positive('rbf')
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.constrain_positive('rbf')
+    m.constrain_positive('t_noise')
     #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    m.optimize(messages=True)
+    #m.optimize(messages=True)
     print(m)
     #return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index a8347345..5b1a814a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -84,12 +84,13 @@ class Laplace(likelihood):
 
         #Implicit
         impl = mdot(dlp, dL_dfhat.T, I_KW_i)
-        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
+        expl = 0.5*expl_a + 0.5*expl_b
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
 
         #dL_dthetaK = np.zeros(dK_dthetaK.shape)
         #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@@ -117,10 +118,12 @@ class Laplace(likelihood):
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
             #                                               might be +
-            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -180,14 +183,20 @@ class Laplace(likelihood):
         W = np.diagflat(self.W)
         Wi = self.Sigma_tilde
         W12i = np.sqrt(Wi)
-        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        #fDf = mdot(self.f_hat.T, D, self.f_hat)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
+
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
                    + l
                    + 0.5*ln_det_K_Wi__Bi
-                   - 0.5*fDf
+                   #- 0.5*fDf
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_Ki_i_y
                   )
+        #print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -316,7 +325,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c3aee835..041b59bd 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -170,7 +170,7 @@ class student_t(likelihood_function):
         return np.asarray(self.sigma)
 
     def _get_param_names(self):
-        return ["t_noise_variance"]
+        return ["t_noise_std"]
 
     def _set_params(self, x):
         self.sigma = float(x)
@@ -191,8 +191,6 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -215,8 +213,6 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
@@ -237,8 +233,6 @@ class student_t(likelihood_function):
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -251,8 +245,6 @@ class student_t(likelihood_function):
 
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
@@ -269,8 +261,6 @@ class student_t(likelihood_function):
 
         $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_dsigma = ( - (1/self.sigma) +
@@ -284,8 +274,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
@@ -299,8 +287,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index d56ee86f..636ebba0 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -145,18 +145,6 @@ class GP(model):
         self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
-            #Reapproximate incase it hasnt been done...
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
-            print self.kern._get_params()
-
-            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
-            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
-
-            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))