Almost have likelihood gradients working but kernels still way off

This commit is contained in:
Alan Saul 2013-05-13 18:36:02 +01:00
parent 9500b12b53
commit 5472c5c6ba
4 changed files with 91 additions and 60 deletions

View file

@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
real_sd = np.sqrt(real_var)
print "Real noise: ", real_sd
initial_var_guess = 0.01
initial_var_guess = 1
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rvrvs(size=Y.shape)
#Y += noise
@ -84,14 +84,21 @@ def debug_student_t_noise_approx():
edited_real_sd = initial_var_guess #real_sd
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m.ensure_default_constraints()
#m.constrain_positive('rbf')
m.constrain_fixed('rbf_v', 1.0898)
m.constrain_fixed('rbf_l', 1.8651)
m.constrain_positive('t_noi')
#m.constrain_fixed('t_noise_variance', real_sd)
m.update_likelihood_approximation()
m.optimize()
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
m.optimize('scg', messages=True)
print(m)
return m
if plot:
plt.suptitle('Student-t likelihood')
plt.subplot(132)
@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Clean student t, ncg"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
m = GPy.models.GP(X, stu_t_likelihood, kernel3)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
if plot:
plt.subplot(133)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
#print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
#m.ensure_default_constraints()
#m.update_likelihood_approximation()
#m.optimize()
#print(m)
#if plot:
#plt.subplot(133)
#m.plot()
#plt.plot(X_full, Y_full)
#plt.ylim(-2.5, 2.5)
#plt.show()

View file

@ -63,6 +63,7 @@ class Laplace(likelihood):
return self.likelihood_function._get_param_names()
def _set_params(self, p):
#print "Setting laplace param with: ", p
return self.likelihood_function._set_params(p)
def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@ -78,10 +79,24 @@ class Laplace(likelihood):
return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
def _shared_gradients_components(self):
dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
#Ki, _, _, _ = pdinv(self.K)
#dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
Wi = np.diagonal(self.Sigma_tilde) #Convenience
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
Ki, _, _, _ = pdinv(self.K)
#dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
#dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
#Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
#dytil_dfhat_explicit = self.Wi__Ki_W
#dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
#dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
a = mdot(dWi_dfhat, Ki, self.f_hat)
dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
return dL_dytil, dytil_dfhat
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@ -94,18 +109,18 @@ class Laplace(likelihood):
"""
dL_dytil, dytil_dfhat = self._shared_gradients_components()
#dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
print "Computing K gradients"
print "dytil_dfhat: ", np.mean(dytil_dfhat)
I = np.eye(self.N)
C = np.dot(self.K, self.W)
A = I + C
#print "Computing K gradients"
#print "dytil_dfhat: ", np.mean(dytil_dfhat)
#I = np.eye(self.N)
#C = np.dot(self.K, self.W)
#A = I + C
#plt.imshow(A)
#plt.show()
#I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
#B = I + w12*K*w12
I_KW_i = self.Bi # could use self.B_chol??
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@ -113,15 +128,22 @@ class Laplace(likelihood):
dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
for ind_j, thetaj in enumerate(dK_dthetaK):
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
#dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
print "\n"
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
dL_dSigma = dL_d_K_Sigma
#d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
#explicit #implicit
#dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
#dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
Wi = np.diagonal(self.Sigma_tilde) #Convenience
dSigma_dthetaK_explicit = 0
@ -140,19 +162,16 @@ class Laplace(likelihood):
dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
#dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
#print "\n"
#print "dL_dytil: ", np.mean(dL_dytil)
#print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
#print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
#print "\n"
#print "dL_dSigma: ", np.mean(dL_dSigma)
#print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
#print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
#print "\n"
#print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
#import ipdb; ipdb.set_trace() # XXX BREAKPOINT
print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
print "\n"
print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
print "\n"
print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
return np.squeeze(dL_dthetaK_implicit)
@ -182,11 +201,15 @@ class Laplace(likelihood):
dL_dytil, dytil_dfhat = self._shared_gradients_components()
#dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
#dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
#dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
#dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
#KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
KW_I_i = self.Bi # could use self.B_chol??
dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
@ -199,7 +222,7 @@ class Laplace(likelihood):
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
#dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
@ -219,8 +242,10 @@ class Laplace(likelihood):
#dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@ -257,7 +282,7 @@ class Laplace(likelihood):
#((L.T*w)_i + I)f_hat = y_tilde
L = jitchol(self.K)
Li = chol_inv(L)
Lt_W = np.dot(L.T, self.W)
Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
##Check it isn't singular!
if cond(Lt_W) > epsilon:
@ -361,7 +386,6 @@ class Laplace(likelihood):
"""
#W is diagnoal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
L = jitchol(B)
return (B, L, W_12)

View file

@ -176,8 +176,6 @@ class student_t(likelihood_function):
def _set_params(self, x):
self.sigma = float(x)
print "Setting student t sigma: ", x
print x
#self.covariance_matrix = np.eye(self.N)*self._variance
#self.precision = 1./self._variance
@ -288,7 +286,7 @@ class student_t(likelihood_function):
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e)
grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*e)
/ ((self.v*(self.sigma**2) + e**2)**2)
)
return np.squeeze(grad_sigma)

View file

@ -66,6 +66,10 @@ class GP(model):
# self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
if isinstance(self.likelihood, Laplace):
print "Updating approx: ", p
self.likelihood.fit_full(self.kern.K(self.X))
self.likelihood._set_params(self.likelihood._get_params())
self.K = self.kern.K(self.X)
self.K += self.likelihood.covariance_matrix
@ -87,14 +91,12 @@ class GP(model):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def _update_params_callback(self, p):
#FIXME:Check the transforming
#Set the new parameters of the kernel and likelihood within the optimization
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
#parameters will be in transformed space
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
#set_params_transformed for likelihood doesn't exist?
self.likelihood._set_params(p[self.kern.Nparam_transformed():])
#update the likelihood approximation within the optimisation with the current parameters
self.update_likelihood_approximation()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
def update_likelihood_approximation(self):
"""
@ -123,7 +125,9 @@ class GP(model):
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
print "Log likelihood: ", l
return l
def _log_likelihood_gradients(self):
"""
@ -135,7 +139,7 @@ class GP(model):
if isinstance(self.likelihood, Laplace):
dL_dthetaK_explicit = dL_dthetaK
#Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
fake_dL_dKs = np.eye(self.dL_dK.shape[0])
fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
#We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
@ -145,13 +149,11 @@ class GP(model):
#print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
#print "dL_dthetaL: ", dL_dthetaL
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
else:
#print "dL_dthetaK: ", dL_dthetaK
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
#print "dL_dthetaL: ", dL_dthetaL
return np.hstack((dL_dthetaK, dL_dthetaL))
#return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))