Almost have likelihood gradients working but kernels still way off

2026-05-18 13:55:14 +02:00 · 2013-05-13 18:36:02 +01:00 · 2013-05-13 18:36:02 +01:00 · 5472c5c6ba
commit 5472c5c6ba
parent 9500b12b53
4 changed files with 91 additions and 60 deletions
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
    real_sd = np.sqrt(real_var)
    print "Real noise: ", real_sd

-    initial_var_guess = 0.01
+    initial_var_guess = 1
    #t_rv = t(deg_free, loc=0, scale=real_var)
    #noise = t_rvrvs(size=Y.shape)
    #Y += noise
@ -84,14 +84,21 @@ def debug_student_t_noise_approx():

    edited_real_sd = initial_var_guess #real_sd

+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
    print "Clean student t, rasm"
    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m.ensure_default_constraints()
+    #m.constrain_positive('rbf')
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
+    m.constrain_positive('t_noi')
+    #m.constrain_fixed('t_noise_variance', real_sd)
    m.update_likelihood_approximation()
-    m.optimize()
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
    print(m)
+    return m
    if plot:
        plt.suptitle('Student-t likelihood')
        plt.subplot(132)
@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
        plt.plot(X_full, Y_full)
        plt.ylim(-2.5, 2.5)

-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    if plot:
-        plt.subplot(133)
-        m.plot()
-        plt.plot(X_full, Y_full)
-        plt.ylim(-2.5, 2.5)
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #if plot:
+        #plt.subplot(133)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+        #plt.ylim(-2.5, 2.5)

    #plt.show()

--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@ -63,6 +63,7 @@ class Laplace(likelihood):
        return self.likelihood_function._get_param_names()

    def _set_params(self, p):
+        #print "Setting laplace param with: ", p
        return self.likelihood_function._set_params(p)

    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@ -78,10 +79,24 @@ class Laplace(likelihood):
        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))

    def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+
+        Ki, _, _, _ = pdinv(self.K)
+        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+
+        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
+        #dytil_dfhat_explicit = self.Wi__Ki_W
+        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
+        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
+
+        a = mdot(dWi_dfhat, Ki, self.f_hat)
+        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
        return dL_dytil, dytil_dfhat

    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@ -94,18 +109,18 @@ class Laplace(likelihood):
        """
        dL_dytil, dytil_dfhat = self._shared_gradients_components()

-
        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))

-        print "Computing K gradients"
-        print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        I = np.eye(self.N)
-        C = np.dot(self.K, self.W)
-        A = I + C
+        #print "Computing K gradients"
+        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
+        #I = np.eye(self.N)
+        #C = np.dot(self.K, self.W)
+        #A = I + C
        #plt.imshow(A)
        #plt.show()

        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #B = I + w12*K*w12
        I_KW_i = self.Bi # could use self.B_chol??

        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@ -113,15 +128,22 @@ class Laplace(likelihood):
        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
        for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
+            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)

+        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
+        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "\n"
+
        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
        dL_dSigma = dL_d_K_Sigma
        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                     #explicit           #implicit
        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+
        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
        Wi = np.diagonal(self.Sigma_tilde) #Convenience
        dSigma_dthetaK_explicit = 0
@ -140,19 +162,16 @@ class Laplace(likelihood):
        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)

-        #print "\n"
-        #print "dL_dytil: ", np.mean(dL_dytil)
-        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        #print "\n"
-        #print "dL_dSigma: ", np.mean(dL_dSigma)
-        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
-        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        #print "\n"
-        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        print "\n"
+        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
+        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
+        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        print "\n"
+        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit

        return np.squeeze(dL_dthetaK_implicit)

@ -182,11 +201,15 @@ class Laplace(likelihood):
        dL_dytil, dytil_dfhat = self._shared_gradients_components()
        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?

-        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
-        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
-        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
+        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        KW_I_i = self.Bi # could use self.B_chol??
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+
        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)

        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
@ -199,7 +222,7 @@ class Laplace(likelihood):

        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit

        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
@ -219,8 +242,10 @@ class Laplace(likelihood):
        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)

        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)

@ -257,7 +282,7 @@ class Laplace(likelihood):
        #((L.T*w)_i + I)f_hat = y_tilde
        L = jitchol(self.K)
        Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W)
+        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster

        ##Check it isn't singular!
        if cond(Lt_W) > epsilon:
@ -361,7 +386,6 @@ class Laplace(likelihood):
        """
        #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
        W_12 = np.sqrt(W)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
        L = jitchol(B)
        return (B, L, W_12)
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -176,8 +176,6 @@ class student_t(likelihood_function):

    def _set_params(self, x):
        self.sigma = float(x)
-        print "Setting student t sigma: ", x
-        print x
        #self.covariance_matrix = np.eye(self.N)*self._variance
        #self.precision = 1./self._variance

@ -288,7 +286,7 @@ class student_t(likelihood_function):
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
                      / ((self.v*(self.sigma**2) + e**2)**2)
                     )
        return np.squeeze(grad_sigma)
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@ -66,6 +66,10 @@ class GP(model):
        # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas

+        if isinstance(self.likelihood, Laplace):
+            print "Updating approx: ", p
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())

        self.K = self.kern.K(self.X)
        self.K += self.likelihood.covariance_matrix
@ -87,14 +91,12 @@ class GP(model):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

    def _update_params_callback(self, p):
-        #FIXME:Check the transforming
-        #Set the new parameters of the kernel and likelihood within the optimization
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #parameters will be in transformed space
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
        #update the likelihood approximation within the optimisation with the current parameters
        self.update_likelihood_approximation()
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

    def update_likelihood_approximation(self):
        """
@ -123,7 +125,9 @@ class GP(model):
        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
        """
-        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "Log likelihood: ", l
+        return l

    def _log_likelihood_gradients(self):
        """
@ -135,7 +139,7 @@ class GP(model):
        if isinstance(self.likelihood, Laplace):
            dL_dthetaK_explicit = dL_dthetaK
            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)

            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
@ -145,13 +149,11 @@ class GP(model):
            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)

            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaL: ", dL_dthetaL
            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
        else:
-            #print "dL_dthetaK: ", dL_dthetaK
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        #print "dL_dthetaL: ", dL_dthetaL
        return np.hstack((dL_dthetaK, dL_dthetaL))
        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))