mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-18 13:55:14 +02:00
Almost have likelihood gradients working but kernels still way off
This commit is contained in:
parent
9500b12b53
commit
5472c5c6ba
4 changed files with 91 additions and 60 deletions
|
|
@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
|
|||
real_sd = np.sqrt(real_var)
|
||||
print "Real noise: ", real_sd
|
||||
|
||||
initial_var_guess = 0.01
|
||||
initial_var_guess = 1
|
||||
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||
#noise = t_rvrvs(size=Y.shape)
|
||||
#Y += noise
|
||||
|
|
@ -84,14 +84,21 @@ def debug_student_t_noise_approx():
|
|||
|
||||
edited_real_sd = initial_var_guess #real_sd
|
||||
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
print "Clean student t, rasm"
|
||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
|
||||
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||
m.ensure_default_constraints()
|
||||
#m.constrain_positive('rbf')
|
||||
m.constrain_fixed('rbf_v', 1.0898)
|
||||
m.constrain_fixed('rbf_l', 1.8651)
|
||||
m.constrain_positive('t_noi')
|
||||
#m.constrain_fixed('t_noise_variance', real_sd)
|
||||
m.update_likelihood_approximation()
|
||||
m.optimize()
|
||||
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
|
||||
m.optimize('scg', messages=True)
|
||||
print(m)
|
||||
return m
|
||||
if plot:
|
||||
plt.suptitle('Student-t likelihood')
|
||||
plt.subplot(132)
|
||||
|
|
@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
|
|||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-2.5, 2.5)
|
||||
|
||||
print "Clean student t, ncg"
|
||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
|
||||
m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||
m.ensure_default_constraints()
|
||||
m.update_likelihood_approximation()
|
||||
m.optimize()
|
||||
print(m)
|
||||
if plot:
|
||||
plt.subplot(133)
|
||||
m.plot()
|
||||
plt.plot(X_full, Y_full)
|
||||
plt.ylim(-2.5, 2.5)
|
||||
#print "Clean student t, ncg"
|
||||
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
|
||||
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
|
||||
#m = GPy.models.GP(X, stu_t_likelihood, kernel3)
|
||||
#m.ensure_default_constraints()
|
||||
#m.update_likelihood_approximation()
|
||||
#m.optimize()
|
||||
#print(m)
|
||||
#if plot:
|
||||
#plt.subplot(133)
|
||||
#m.plot()
|
||||
#plt.plot(X_full, Y_full)
|
||||
#plt.ylim(-2.5, 2.5)
|
||||
|
||||
#plt.show()
|
||||
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ class Laplace(likelihood):
|
|||
return self.likelihood_function._get_param_names()
|
||||
|
||||
def _set_params(self, p):
|
||||
#print "Setting laplace param with: ", p
|
||||
return self.likelihood_function._set_params(p)
|
||||
|
||||
def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
|
||||
|
|
@ -78,10 +79,24 @@ class Laplace(likelihood):
|
|||
return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
|
||||
|
||||
def _shared_gradients_components(self):
|
||||
dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
|
||||
dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
|
||||
#Ki, _, _, _ = pdinv(self.K)
|
||||
#dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
|
||||
dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
|
||||
|
||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
||||
Wi = np.diagonal(self.Sigma_tilde) #Convenience
|
||||
#Can just hadamard product as diagonal matricies multiplied are just multiplying elements
|
||||
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
|
||||
|
||||
Ki, _, _, _ = pdinv(self.K)
|
||||
#dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
|
||||
#dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
|
||||
|
||||
#Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
|
||||
#dytil_dfhat_explicit = self.Wi__Ki_W
|
||||
#dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
|
||||
#dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
|
||||
|
||||
a = mdot(dWi_dfhat, Ki, self.f_hat)
|
||||
dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
|
||||
return dL_dytil, dytil_dfhat
|
||||
|
||||
def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
|
||||
|
|
@ -94,18 +109,18 @@ class Laplace(likelihood):
|
|||
"""
|
||||
dL_dytil, dytil_dfhat = self._shared_gradients_components()
|
||||
|
||||
|
||||
#dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
|
||||
|
||||
print "Computing K gradients"
|
||||
print "dytil_dfhat: ", np.mean(dytil_dfhat)
|
||||
I = np.eye(self.N)
|
||||
C = np.dot(self.K, self.W)
|
||||
A = I + C
|
||||
#print "Computing K gradients"
|
||||
#print "dytil_dfhat: ", np.mean(dytil_dfhat)
|
||||
#I = np.eye(self.N)
|
||||
#C = np.dot(self.K, self.W)
|
||||
#A = I + C
|
||||
#plt.imshow(A)
|
||||
#plt.show()
|
||||
|
||||
#I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
|
||||
#B = I + w12*K*w12
|
||||
I_KW_i = self.Bi # could use self.B_chol??
|
||||
|
||||
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
|
||||
|
|
@ -113,15 +128,22 @@ class Laplace(likelihood):
|
|||
dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
|
||||
grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
|
||||
for ind_j, thetaj in enumerate(dK_dthetaK):
|
||||
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
|
||||
#dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
|
||||
dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
|
||||
|
||||
print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
|
||||
print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
|
||||
dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
|
||||
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
|
||||
print "\n"
|
||||
|
||||
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
|
||||
dL_dSigma = dL_d_K_Sigma
|
||||
#d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
||||
#explicit #implicit
|
||||
#dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
|
||||
#dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
|
||||
|
||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
||||
Wi = np.diagonal(self.Sigma_tilde) #Convenience
|
||||
dSigma_dthetaK_explicit = 0
|
||||
|
|
@ -140,19 +162,16 @@ class Laplace(likelihood):
|
|||
dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
|
||||
dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
|
||||
dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
|
||||
#dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
|
||||
|
||||
#print "\n"
|
||||
#print "dL_dytil: ", np.mean(dL_dytil)
|
||||
#print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
|
||||
#print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
|
||||
#print "\n"
|
||||
#print "dL_dSigma: ", np.mean(dL_dSigma)
|
||||
#print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
|
||||
#print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
|
||||
#print "\n"
|
||||
#print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
|
||||
#import ipdb; ipdb.set_trace() # XXX BREAKPOINT
|
||||
print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
|
||||
print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
|
||||
print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
|
||||
print "\n"
|
||||
print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
|
||||
print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
|
||||
print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
|
||||
print "\n"
|
||||
print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
|
||||
|
||||
return np.squeeze(dL_dthetaK_implicit)
|
||||
|
||||
|
|
@ -182,11 +201,15 @@ class Laplace(likelihood):
|
|||
dL_dytil, dytil_dfhat = self._shared_gradients_components()
|
||||
#dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
||||
|
||||
dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
||||
dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
|
||||
dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
|
||||
dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
|
||||
dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
|
||||
dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
||||
dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
|
||||
#dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
|
||||
#dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
|
||||
#dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
|
||||
#KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
|
||||
KW_I_i = self.Bi # could use self.B_chol??
|
||||
dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
|
||||
|
||||
dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
|
||||
|
||||
#FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
|
||||
|
|
@ -199,7 +222,7 @@ class Laplace(likelihood):
|
|||
|
||||
d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
|
||||
dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
|
||||
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
|
||||
dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
|
||||
dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
|
||||
|
||||
#dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
|
||||
|
|
@ -219,8 +242,10 @@ class Laplace(likelihood):
|
|||
#dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
|
||||
|
||||
dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
|
||||
dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
|
||||
dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
|
||||
dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
|
||||
dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
|
||||
return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
||||
|
||||
|
|
@ -257,7 +282,7 @@ class Laplace(likelihood):
|
|||
#((L.T*w)_i + I)f_hat = y_tilde
|
||||
L = jitchol(self.K)
|
||||
Li = chol_inv(L)
|
||||
Lt_W = np.dot(L.T, self.W)
|
||||
Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
|
||||
|
||||
##Check it isn't singular!
|
||||
if cond(Lt_W) > epsilon:
|
||||
|
|
@ -361,7 +386,6 @@ class Laplace(likelihood):
|
|||
"""
|
||||
#W is diagnoal so its sqrt is just the sqrt of the diagonal elements
|
||||
W_12 = np.sqrt(W)
|
||||
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
|
||||
L = jitchol(B)
|
||||
return (B, L, W_12)
|
||||
|
|
|
|||
|
|
@ -176,8 +176,6 @@ class student_t(likelihood_function):
|
|||
|
||||
def _set_params(self, x):
|
||||
self.sigma = float(x)
|
||||
print "Setting student t sigma: ", x
|
||||
print x
|
||||
#self.covariance_matrix = np.eye(self.N)*self._variance
|
||||
#self.precision = 1./self._variance
|
||||
|
||||
|
|
@ -288,7 +286,7 @@ class student_t(likelihood_function):
|
|||
f = np.squeeze(f)
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e)
|
||||
grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*e)
|
||||
/ ((self.v*(self.sigma**2) + e**2)**2)
|
||||
)
|
||||
return np.squeeze(grad_sigma)
|
||||
|
|
|
|||
|
|
@ -66,6 +66,10 @@ class GP(model):
|
|||
# self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
|
||||
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
|
||||
|
||||
if isinstance(self.likelihood, Laplace):
|
||||
print "Updating approx: ", p
|
||||
self.likelihood.fit_full(self.kern.K(self.X))
|
||||
self.likelihood._set_params(self.likelihood._get_params())
|
||||
|
||||
self.K = self.kern.K(self.X)
|
||||
self.K += self.likelihood.covariance_matrix
|
||||
|
|
@ -87,14 +91,12 @@ class GP(model):
|
|||
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def _update_params_callback(self, p):
|
||||
#FIXME:Check the transforming
|
||||
#Set the new parameters of the kernel and likelihood within the optimization
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
#parameters will be in transformed space
|
||||
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
|
||||
#set_params_transformed for likelihood doesn't exist?
|
||||
self.likelihood._set_params(p[self.kern.Nparam_transformed():])
|
||||
#update the likelihood approximation within the optimisation with the current parameters
|
||||
self.update_likelihood_approximation()
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
|
||||
def update_likelihood_approximation(self):
|
||||
"""
|
||||
|
|
@ -123,7 +125,9 @@ class GP(model):
|
|||
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
|
||||
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
|
||||
"""
|
||||
return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
|
||||
l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
|
||||
print "Log likelihood: ", l
|
||||
return l
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
"""
|
||||
|
|
@ -135,7 +139,7 @@ class GP(model):
|
|||
if isinstance(self.likelihood, Laplace):
|
||||
dL_dthetaK_explicit = dL_dthetaK
|
||||
#Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
|
||||
fake_dL_dKs = np.eye(self.dL_dK.shape[0])
|
||||
fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
|
||||
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
|
||||
|
||||
#We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
|
||||
|
|
@ -145,13 +149,11 @@ class GP(model):
|
|||
#print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
|
||||
|
||||
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
||||
#print "dL_dthetaL: ", dL_dthetaL
|
||||
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
else:
|
||||
#print "dL_dthetaK: ", dL_dthetaK
|
||||
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
||||
print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
|
||||
#print "dL_dthetaL: ", dL_dthetaL
|
||||
return np.hstack((dL_dthetaK, dL_dthetaL))
|
||||
#return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue