From 117c377d13efe81b2df567936ff48e85f918efcd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 29 May 2013 14:02:03 +0100 Subject: [PATCH] Ripped out all things Laplace parameter estimation, starting again with new tactic --- GPy/likelihoods/Laplace.py | 175 +------------------------------------ GPy/models/GP.py | 8 +- 2 files changed, 4 insertions(+), 179 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 666fa227..69c0876b 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,187 +79,18 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - Ki, _, _, _ = pdinv(self.K) - - #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) - #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R - #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T))) - #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde), - #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y) - c = inv(self.K+self.Sigma_tilde) - dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y)) - - P = np.diagflat(1/np.dot(Ki, self.f_hat)) - K_Wi_i = inv(self.K+self.Sigma_tilde) - - dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P)) - +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P) - ) * np.eye(self.N)) - dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term - dL_dytil = dL_dytil.reshape(1, self.N) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - Wi = np.diagonal(self.Sigma_tilde) #Convenience - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - - - #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full - #dytil_dfhat_explicit = self.Wi__Ki_W - #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit - #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically - - dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) - self.dytil_dfhat = dytil_dfhat - #dytil_dfhat = np.eye(dytil_dfhat.shape[0]) - self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding.... - return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): """ - #explicit #implicit #implicit - dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) - :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance) - :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers - :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters + Gradients with respect to prior kernel parameters """ - dL_dytil, dytil_dfhat = self._shared_gradients_components() - - #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) - - #print "Computing K gradients" - #print "dytil_dfhat: ", np.mean(dytil_dfhat) - #I = np.eye(self.N) - #C = np.dot(self.K, self.W) - #A = I + C - #plt.imshow(A) - #plt.show() - - #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! - #B = I + w12*K*w12 - I_KW_i = self.Bi # could use self.B_chol?? - - #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! - #Derivative for each f dimension, for each of K's hyper parameters - dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) - grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - for ind_j, thetaj in enumerate(dK_dthetaK): - #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad))) - dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad) - - print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat) - print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK) - dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) - print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) - print "\n" - - #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? - dL_dSigma = dL_d_K_Sigma - #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - #explicit #implicit - #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) - #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - Wi = np.diagonal(self.Sigma_tilde) #Convenience - dSigma_dthetaK_explicit = 0 - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0) - dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK) - dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit - #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK) - #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK): - #dSigma_dthetaK_explicit = 0 - #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat - #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde) - - #FIXME: Won't handle multi dimensional data - dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) - dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0) - dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma - - print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil) - print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) - print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil - print "\n" - print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma) - print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK) - print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma - print "\n" - print "dL_dthetaK_implicit: ", dL_dthetaK_implicit - - return np.squeeze(dL_dthetaK_implicit) + return dL_dthetaK def _gradients(self, partial): """ Gradients with respect to likelihood parameters - - Complicated, it differs for parameters of the kernel \theta_{K}, and - parameters of the likelihood, \theta_{L} - - dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) - dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL) - dL_dK*dK_dthetaL = 0 - - dytil_dthetaX = dytil_dfhat * dfhat_dthetaX - dytil_dfhat = Sigma*Ki + I - - fhat = K*log_p(y|fhat) from rasm p125 - dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125 - - dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi - dW_dthetaX = d_dthetaX[d2phi_d2fhat] - d2phi_d2fhat = Hessian function of likelihood - - partial = dL_d_K_Sigma """ - dL_dytil, dytil_dfhat = self._shared_gradients_components() - #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - - dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) - #KW_I_i = self.Bi # could use self.B_chol?? - dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL)) - dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None] - - dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) - - #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? - dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma - - Wi = np.diagonal(self.Sigma_tilde) #Convenience - #-1 as we are looking at W which is -1*d2log p(y|f) - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi)) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL) - dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit - - #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - #Derivative for each f dimension, for each of K's hyper parameters - #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names()))) - #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T): - #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde, - #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)? - #self.Sigma_tilde - #) - - #TODO: This is Wi*A*Wi, can be more numerically stable with a trick - #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) - - #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL - #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) - #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) - - dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) - dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)) - dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma - - return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) + return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 17e2a1b1..da379eb1 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -150,14 +150,8 @@ class GP(model): fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now - dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) - dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit - - #print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) - + dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - #print "dL_dthetaL: ", dL_dthetaL print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))