Ripped out all things Laplace parameter estimation, starting again with new tactic

This commit is contained in:
Alan Saul 2013-05-29 14:02:03 +01:00
parent d63d370641
commit 117c377d13
2 changed files with 4 additions and 179 deletions

View file

@ -79,187 +79,18 @@ class Laplace(likelihood):
return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
def _shared_gradients_components(self):
Ki, _, _, _ = pdinv(self.K)
#Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
#dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
#dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
#dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
#dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
c = inv(self.K+self.Sigma_tilde)
dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
P = np.diagflat(1/np.dot(Ki, self.f_hat))
K_Wi_i = inv(self.K+self.Sigma_tilde)
dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
+0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
) * np.eye(self.N))
dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
dL_dytil = dL_dytil.reshape(1, self.N)
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
Wi = np.diagonal(self.Sigma_tilde) #Convenience
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
#Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
#dytil_dfhat_explicit = self.Wi__Ki_W
#dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
#dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
self.dytil_dfhat = dytil_dfhat
#dytil_dfhat = np.eye(dytil_dfhat.shape[0])
self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
return dL_dytil, dytil_dfhat
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
"""
#explicit #implicit #implicit
dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
:param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
:param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
:returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
Gradients with respect to prior kernel parameters
"""
dL_dytil, dytil_dfhat = self._shared_gradients_components()
#dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
#print "Computing K gradients"
#print "dytil_dfhat: ", np.mean(dytil_dfhat)
#I = np.eye(self.N)
#C = np.dot(self.K, self.W)
#A = I + C
#plt.imshow(A)
#plt.show()
#I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
#B = I + w12*K*w12
I_KW_i = self.Bi # could use self.B_chol??
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
#Derivative for each f dimension, for each of K's hyper parameters
dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
for ind_j, thetaj in enumerate(dK_dthetaK):
#dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
print "\n"
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
dL_dSigma = dL_d_K_Sigma
#d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
#explicit #implicit
#dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
#dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
Wi = np.diagonal(self.Sigma_tilde) #Convenience
dSigma_dthetaK_explicit = 0
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
#dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
#dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
#for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
#dSigma_dthetaK_explicit = 0
#dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
#dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
#FIXME: Won't handle multi dimensional data
dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
print "\n"
print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
print "\n"
print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
return np.squeeze(dL_dthetaK_implicit)
return dL_dthetaK
def _gradients(self, partial):
"""
Gradients with respect to likelihood parameters
Complicated, it differs for parameters of the kernel \theta_{K}, and
parameters of the likelihood, \theta_{L}
dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
dL_dK*dK_dthetaL = 0
dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
dytil_dfhat = Sigma*Ki + I
fhat = K*log_p(y|fhat) from rasm p125
dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125
dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
dW_dthetaX = d_dthetaX[d2phi_d2fhat]
d2phi_d2fhat = Hessian function of likelihood
partial = dL_d_K_Sigma
"""
dL_dytil, dytil_dfhat = self._shared_gradients_components()
#dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
#KW_I_i = self.Bi # could use self.B_chol??
dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
Wi = np.diagonal(self.Sigma_tilde) #Convenience
#-1 as we are looking at W which is -1*d2log p(y|f)
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
#dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
#Derivative for each f dimension, for each of K's hyper parameters
#dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
#for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
#dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
#dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
#self.Sigma_tilde
#)
#TODO: This is Wi*A*Wi, can be more numerically stable with a trick
#dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
#dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
#dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
#dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
def _compute_GP_variables(self):
"""

View file

@ -150,14 +150,8 @@ class GP(model):
fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
#We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
#print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
#print "dL_dthetaL: ", dL_dthetaL
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
else:
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))