Ripped out all things Laplace parameter estimation, starting again with new tactic

2026-07-26 17:11:06 +02:00 · 2013-05-29 14:02:03 +01:00 · 2013-05-29 14:02:03 +01:00 · 117c377d13
commit 117c377d13
parent d63d370641
2 changed files with 4 additions and 179 deletions
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@ -79,187 +79,18 @@ class Laplace(likelihood):
        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
    def _shared_gradients_components(self):
        Ki, _, _, _ = pdinv(self.K)
        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
        c = inv(self.K+self.Sigma_tilde)
        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
        P = np.diagflat(1/np.dot(Ki, self.f_hat))
        K_Wi_i = inv(self.K+self.Sigma_tilde)
        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
                                           ) * np.eye(self.N))
        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
        dL_dytil = dL_dytil.reshape(1, self.N)
        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
        Wi = np.diagonal(self.Sigma_tilde) #Convenience
        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
        #dytil_dfhat_explicit = self.Wi__Ki_W
        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
        self.dytil_dfhat = dytil_dfhat
        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
        return dL_dytil, dytil_dfhat
    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
        """
-                           #explicit                #implicit                     #implicit
+        Gradients with respect to prior kernel parameters
        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
        :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
        :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
        """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        return dL_dthetaK
        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
        #print "Computing K gradients"
        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
        #I = np.eye(self.N)
        #C = np.dot(self.K, self.W)
        #A = I + C
        #plt.imshow(A)
        #plt.show()
        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
        #B = I + w12*K*w12
        I_KW_i = self.Bi # could use self.B_chol??
        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
        #Derivative for each f dimension, for each of K's hyper parameters
        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
        for ind_j, thetaj in enumerate(dK_dthetaK):
            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
        print "\n"
        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
        dL_dSigma = dL_d_K_Sigma
        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                     #explicit           #implicit
        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
        Wi = np.diagonal(self.Sigma_tilde) #Convenience
        dSigma_dthetaK_explicit = 0
        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
            #dSigma_dthetaK_explicit = 0
            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
        #FIXME: Won't handle multi dimensional data
        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
        print "\n"
        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
        print "\n"
        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
        return np.squeeze(dL_dthetaK_implicit)
    def _gradients(self, partial):
        """
        Gradients with respect to likelihood parameters
        Complicated, it differs for parameters of the kernel \theta_{K}, and
        parameters of the likelihood, \theta_{L}
        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
        dL_dK*dK_dthetaL = 0
        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
        dytil_dfhat = Sigma*Ki + I
        fhat = K*log_p(y|fhat)                                          from rasm p125
        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
        d2phi_d2fhat = Hessian function of likelihood
        partial = dL_d_K_Sigma
        """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
        #KW_I_i = self.Bi # could use self.B_chol??
        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
        Wi = np.diagonal(self.Sigma_tilde) #Convenience
        #-1 as we are looking at W which is -1*d2log p(y|f)
        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
        #Derivative for each f dimension, for each of K's hyper parameters
        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
                                             #self.Sigma_tilde
                                             #)
        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
    def _compute_GP_variables(self):
        """
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@ -150,14 +150,8 @@ class GP(model):
            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
+            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
            #print "dL_dthetaL: ", dL_dthetaL
            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
        else:
            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))