Now gradchecks everytime but student_t fit is bad, noise is underestimated by a long way

2026-07-02 16:01:03 +02:00 · 2013-06-19 12:00:00 +01:00 · 2013-06-19 12:00:00 +01:00 · de689fa8e9
commit de689fa8e9
parent ac461e1b2a
4 changed files with 29 additions and 44 deletions
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
    plot = False
    real_var = 0.1
    #Start a function, any function
-    X = np.linspace(0.0, 10.0, 15)[:, None]
+    X = np.linspace(0.0, 10.0, 50)[:, None]
    #X = np.array([0.5])[:, None]
    Y = np.sin(X) + np.random.randn(*X.shape)*real_var

-    X_full = np.linspace(0.0, 10.0, 15)[:, None]
+    X_full = np.linspace(0.0, 10.0, 50)[:, None]
    Y_full = np.sin(X_full)

    Y = Y/Y.max()

    #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
    real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise std: ", real_sd

-    initial_var_guess = 0.02
+    initial_var_guess = 0.3
    #t_rv = t(deg_free, loc=0, scale=real_var)
    #noise = t_rvrvs(size=Y.shape)
    #Y += noise

    plt.close('all')
    # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
    kernel2 = kernel1.copy()
    kernel3 = kernel1.copy()
    kernel4 = kernel1.copy()
@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
        #plt.plot(X_full, Y_full)
    #print m

-    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = initial_var_guess #real_sd
    edited_real_sd = real_sd

    print "Clean student t, rasm"
    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m['white'] = 1e-3
    #m.constrain_positive('rbf')
    #m.constrain_fixed('rbf_v', 1.0898)
    #m.constrain_fixed('rbf_l', 1.8651)
    #m.constrain_fixed('t_noise_variance', real_sd)
    m.constrain_positive('rbf')
+    m.constrain_positive('t_noise')
    #m.constrain_fixed('t_noi', real_sd)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
-    m.optimize(messages=True)
+    #m.optimize(messages=True)
    print(m)
    #return m
    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@ -84,12 +84,13 @@ class Laplace(likelihood):

        #Implicit
        impl = mdot(dlp, dL_dfhat.T, I_KW_i)
-        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
        expl_b = Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
+        expl = 0.5*expl_a + 0.5*expl_b
        dL_dthetaK_exp = dK_dthetaK(expl, X)
        dL_dthetaK_imp = dK_dthetaK(impl, X)
-        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp

        #dL_dthetaK = np.zeros(dK_dthetaK.shape)
        #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@ -117,10 +118,12 @@ class Laplace(likelihood):
            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
            #                                               might be +
-            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
            #Implicit
            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp

        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)

@ -180,14 +183,20 @@ class Laplace(likelihood):
        W = np.diagflat(self.W)
        Wi = self.Sigma_tilde
        W12i = np.sqrt(Wi)
-        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        #fDf = mdot(self.f_hat.T, D, self.f_hat)
        l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
+
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
        Z_tilde = (+ self.NORMAL_CONST
                   + l
                   + 0.5*ln_det_K_Wi__Bi
-                   - 0.5*fDf
+                   #- 0.5*fDf
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_Ki_i_y
                  )
+        #print "Ztilde: {}".format(Z_tilde)

        #Convert to float as its (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
@ -316,7 +325,7 @@ class Laplace(likelihood):
            #f_old = f.copy()
            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
            if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                    # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    # To cause the posterior to become less certain than the prior and likelihood,
                                    # This is a property only held by non-log-concave likelihoods
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -170,7 +170,7 @@ class student_t(likelihood_function):
        return np.asarray(self.sigma)

    def _get_param_names(self):
-        return ["t_noise_variance"]
+        return ["t_noise_std"]

    def _set_params(self, x):
        self.sigma = float(x)
@ -191,8 +191,6 @@ class student_t(likelihood_function):
        :returns: float(likelihood evaluated for this point)

        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape

        e = y - f
@ -215,8 +213,6 @@ class student_t(likelihood_function):
        :returns: gradient of likelihood evaluated at points

        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
@ -237,8 +233,6 @@ class student_t(likelihood_function):
        :extra_data: extra_data which is not used in student t distribution
        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape

        e = y - f
@ -251,8 +245,6 @@ class student_t(likelihood_function):

        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
@ -269,8 +261,6 @@ class student_t(likelihood_function):

        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        dlik_dsigma = ( - (1/self.sigma) +
@ -284,8 +274,6 @@ class student_t(likelihood_function):

        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
@ -299,8 +287,6 @@ class student_t(likelihood_function):

        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
        """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@ -145,18 +145,6 @@ class GP(model):
        self.likelihood._set_params(self.likelihood._get_params())
        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
        if isinstance(self.likelihood, Laplace):
-            #Reapproximate incase it hasnt been done...
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
-            print self.kern._get_params()
-
-            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
-            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
-
-            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-
            dK_dthetaK = self.kern.dK_dtheta
            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))