mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-18 13:55:14 +02:00
Ripped out all things Laplace parameter estimation, starting again with new tactic
This commit is contained in:
parent
d63d370641
commit
117c377d13
2 changed files with 4 additions and 179 deletions
|
|
@ -79,187 +79,18 @@ class Laplace(likelihood):
|
||||||
return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
|
return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
|
||||||
|
|
||||||
def _shared_gradients_components(self):
|
def _shared_gradients_components(self):
|
||||||
Ki, _, _, _ = pdinv(self.K)
|
|
||||||
|
|
||||||
#Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
|
|
||||||
#dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
|
|
||||||
#dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
|
|
||||||
#dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
|
|
||||||
#dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
|
|
||||||
c = inv(self.K+self.Sigma_tilde)
|
|
||||||
dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
|
|
||||||
|
|
||||||
P = np.diagflat(1/np.dot(Ki, self.f_hat))
|
|
||||||
K_Wi_i = inv(self.K+self.Sigma_tilde)
|
|
||||||
|
|
||||||
dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
|
|
||||||
+0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
|
|
||||||
) * np.eye(self.N))
|
|
||||||
dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
|
|
||||||
dL_dytil = dL_dytil.reshape(1, self.N)
|
|
||||||
|
|
||||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
|
||||||
Wi = np.diagonal(self.Sigma_tilde) #Convenience
|
|
||||||
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
|
|
||||||
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
|
|
||||||
|
|
||||||
|
|
||||||
#Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
|
|
||||||
#dytil_dfhat_explicit = self.Wi__Ki_W
|
|
||||||
#dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
|
|
||||||
#dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
|
|
||||||
|
|
||||||
dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
|
|
||||||
self.dytil_dfhat = dytil_dfhat
|
|
||||||
#dytil_dfhat = np.eye(dytil_dfhat.shape[0])
|
|
||||||
self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
|
|
||||||
return dL_dytil, dytil_dfhat
|
|
||||||
|
|
||||||
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
|
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
|
||||||
"""
|
"""
|
||||||
#explicit #implicit #implicit
|
Gradients with respect to prior kernel parameters
|
||||||
dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
|
|
||||||
:param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
|
|
||||||
:param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
|
|
||||||
:returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
|
|
||||||
"""
|
"""
|
||||||
dL_dytil, dytil_dfhat = self._shared_gradients_components()
|
return dL_dthetaK
|
||||||
|
|
||||||
#dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
|
|
||||||
|
|
||||||
#print "Computing K gradients"
|
|
||||||
#print "dytil_dfhat: ", np.mean(dytil_dfhat)
|
|
||||||
#I = np.eye(self.N)
|
|
||||||
#C = np.dot(self.K, self.W)
|
|
||||||
#A = I + C
|
|
||||||
#plt.imshow(A)
|
|
||||||
#plt.show()
|
|
||||||
|
|
||||||
#I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
|
|
||||||
#B = I + w12*K*w12
|
|
||||||
I_KW_i = self.Bi # could use self.B_chol??
|
|
||||||
|
|
||||||
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
|
|
||||||
#Derivative for each f dimension, for each of K's hyper parameters
|
|
||||||
dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
|
|
||||||
grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
|
|
||||||
for ind_j, thetaj in enumerate(dK_dthetaK):
|
|
||||||
#dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
|
|
||||||
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
|
|
||||||
|
|
||||||
print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
|
|
||||||
print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
|
|
||||||
dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
|
|
||||||
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
|
|
||||||
print "\n"
|
|
||||||
|
|
||||||
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
|
|
||||||
dL_dSigma = dL_d_K_Sigma
|
|
||||||
#d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
|
||||||
#explicit #implicit
|
|
||||||
#dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
|
|
||||||
#dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
|
|
||||||
|
|
||||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
|
||||||
Wi = np.diagonal(self.Sigma_tilde) #Convenience
|
|
||||||
dSigma_dthetaK_explicit = 0
|
|
||||||
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
|
|
||||||
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
|
|
||||||
#dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
|
|
||||||
dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
|
|
||||||
dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
|
|
||||||
#dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
|
|
||||||
#for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
|
|
||||||
#dSigma_dthetaK_explicit = 0
|
|
||||||
#dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
|
|
||||||
#dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
|
|
||||||
|
|
||||||
#FIXME: Won't handle multi dimensional data
|
|
||||||
dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
|
|
||||||
dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
|
|
||||||
dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
|
|
||||||
|
|
||||||
print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
|
|
||||||
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
|
|
||||||
print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
|
|
||||||
print "\n"
|
|
||||||
print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
|
|
||||||
print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
|
|
||||||
print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
|
|
||||||
print "\n"
|
|
||||||
print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
|
|
||||||
|
|
||||||
return np.squeeze(dL_dthetaK_implicit)
|
|
||||||
|
|
||||||
def _gradients(self, partial):
|
def _gradients(self, partial):
|
||||||
"""
|
"""
|
||||||
Gradients with respect to likelihood parameters
|
Gradients with respect to likelihood parameters
|
||||||
|
|
||||||
Complicated, it differs for parameters of the kernel \theta_{K}, and
|
|
||||||
parameters of the likelihood, \theta_{L}
|
|
||||||
|
|
||||||
dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
|
|
||||||
dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
|
|
||||||
dL_dK*dK_dthetaL = 0
|
|
||||||
|
|
||||||
dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
|
|
||||||
dytil_dfhat = Sigma*Ki + I
|
|
||||||
|
|
||||||
fhat = K*log_p(y|fhat) from rasm p125
|
|
||||||
dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125
|
|
||||||
|
|
||||||
dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
|
|
||||||
dW_dthetaX = d_dthetaX[d2phi_d2fhat]
|
|
||||||
d2phi_d2fhat = Hessian function of likelihood
|
|
||||||
|
|
||||||
partial = dL_d_K_Sigma
|
|
||||||
"""
|
"""
|
||||||
dL_dytil, dytil_dfhat = self._shared_gradients_components()
|
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
||||||
#dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
|
||||||
|
|
||||||
dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
|
||||||
dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
|
|
||||||
KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
|
|
||||||
#KW_I_i = self.Bi # could use self.B_chol??
|
|
||||||
dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
|
|
||||||
dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
|
|
||||||
|
|
||||||
dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
|
|
||||||
|
|
||||||
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
|
|
||||||
dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
|
|
||||||
|
|
||||||
Wi = np.diagonal(self.Sigma_tilde) #Convenience
|
|
||||||
#-1 as we are looking at W which is -1*d2log p(y|f)
|
|
||||||
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
|
|
||||||
dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
|
|
||||||
|
|
||||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
|
||||||
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
|
|
||||||
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
|
|
||||||
dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
|
|
||||||
|
|
||||||
#dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
|
||||||
#Derivative for each f dimension, for each of K's hyper parameters
|
|
||||||
#dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
|
|
||||||
#for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
|
|
||||||
#dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
|
|
||||||
#dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
|
|
||||||
#self.Sigma_tilde
|
|
||||||
#)
|
|
||||||
|
|
||||||
#TODO: This is Wi*A*Wi, can be more numerically stable with a trick
|
|
||||||
#dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
|
|
||||||
|
|
||||||
#dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
|
|
||||||
#dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
|
|
||||||
#dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
|
|
||||||
|
|
||||||
dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
|
|
||||||
dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
|
|
||||||
dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
|
|
||||||
|
|
||||||
return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
|
||||||
|
|
||||||
def _compute_GP_variables(self):
|
def _compute_GP_variables(self):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -150,14 +150,8 @@ class GP(model):
|
||||||
fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
|
fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
|
||||||
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
|
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
|
||||||
|
|
||||||
#We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
|
dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
|
||||||
dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
|
|
||||||
dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
|
|
||||||
|
|
||||||
#print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
|
|
||||||
|
|
||||||
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
||||||
#print "dL_dthetaL: ", dL_dthetaL
|
|
||||||
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
|
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
|
||||||
else:
|
else:
|
||||||
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue