diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 57ae9be7..2054881c 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -52,7 +52,7 @@ def debug_student_t_noise_approx(): real_sd = np.sqrt(real_var) print "Real noise: ", real_sd - initial_var_guess = 0.01 + initial_var_guess = 1 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -84,14 +84,21 @@ def debug_student_t_noise_approx(): edited_real_sd = initial_var_guess #real_sd + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) - m.ensure_default_constraints() + #m.constrain_positive('rbf') + m.constrain_fixed('rbf_v', 1.0898) + m.constrain_fixed('rbf_l', 1.8651) + m.constrain_positive('t_noi') + #m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() - m.optimize() + #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) + m.optimize('scg', messages=True) print(m) + return m if plot: plt.suptitle('Student-t likelihood') plt.subplot(132) @@ -99,19 +106,19 @@ def debug_student_t_noise_approx(): plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) - print "Clean student t, ncg" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) - m = GPy.models.GP(X, stu_t_likelihood, kernel3) - m.ensure_default_constraints() - m.update_likelihood_approximation() - m.optimize() - print(m) - if plot: - plt.subplot(133) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + #print "Clean student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + #m = GPy.models.GP(X, stu_t_likelihood, kernel3) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #if plot: + #plt.subplot(133) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) #plt.show() diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 208b1102..5b3e8f43 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -63,6 +63,7 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): + #print "Setting laplace param with: ", p return self.likelihood_function._set_params(p) def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -78,10 +79,24 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R - dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? - #Ki, _, _, _ = pdinv(self.K) - #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? + dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R + + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + Wi = np.diagonal(self.Sigma_tilde) #Convenience + #Can just hadamard product as diagonal matricies multiplied are just multiplying elements + dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) + + Ki, _, _, _ = pdinv(self.K) + #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N) + #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N) + + #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full + #dytil_dfhat_explicit = self.Wi__Ki_W + #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit + #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically + + a = mdot(dWi_dfhat, Ki, self.f_hat) + dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -94,18 +109,18 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) - print "Computing K gradients" - print "dytil_dfhat: ", np.mean(dytil_dfhat) - I = np.eye(self.N) - C = np.dot(self.K, self.W) - A = I + C + #print "Computing K gradients" + #print "dytil_dfhat: ", np.mean(dytil_dfhat) + #I = np.eye(self.N) + #C = np.dot(self.K, self.W) + #A = I + C #plt.imshow(A) #plt.show() #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! + #B = I + w12*K*w12 I_KW_i = self.Bi # could use self.B_chol?? #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! @@ -113,15 +128,22 @@ class Laplace(likelihood): dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) for ind_j, thetaj in enumerate(dK_dthetaK): - dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad)) + #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad))) + dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad) + print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat) + print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK) dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) + print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) + print "\n" + #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? dL_dSigma = dL_d_K_Sigma #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) #explicit #implicit #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) Wi = np.diagonal(self.Sigma_tilde) #Convenience dSigma_dthetaK_explicit = 0 @@ -140,19 +162,16 @@ class Laplace(likelihood): dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0) dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma - #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) - #print "\n" - #print "dL_dytil: ", np.mean(dL_dytil) - #print "dytil_dthetaK: ", np.mean(dytil_dthetaK) - #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil - #print "\n" - #print "dL_dSigma: ", np.mean(dL_dSigma) - #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK) - #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma - #print "\n" - #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit - #import ipdb; ipdb.set_trace() # XXX BREAKPOINT + print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil) + print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) + print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil + print "\n" + print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma) + print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK) + print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma + print "\n" + print "dL_dthetaK_implicit: ", dL_dthetaK_implicit return np.squeeze(dL_dthetaK_implicit) @@ -182,11 +201,15 @@ class Laplace(likelihood): dL_dytil, dytil_dfhat = self._shared_gradients_components() #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data) - dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself? - dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f - dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) + dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) + #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself? + #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f + #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) + #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) + KW_I_i = self.Bi # could use self.B_chol?? + dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat)) + dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? @@ -199,7 +222,7 @@ class Laplace(likelihood): d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic) + dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL) dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? @@ -219,8 +242,10 @@ class Laplace(likelihood): #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) - dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) + dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0) dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma + dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -257,7 +282,7 @@ class Laplace(likelihood): #((L.T*w)_i + I)f_hat = y_tilde L = jitchol(self.K) Li = chol_inv(L) - Lt_W = np.dot(L.T, self.W) + Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster ##Check it isn't singular! if cond(Lt_W) > epsilon: @@ -361,7 +386,6 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12)) L = jitchol(B) return (B, L, W_12) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 61c79385..6eef9f33 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -176,8 +176,6 @@ class student_t(likelihood_function): def _set_params(self, x): self.sigma = float(x) - print "Setting student t sigma: ", x - print x #self.covariance_matrix = np.eye(self.N)*self._variance #self.precision = 1./self._variance @@ -288,7 +286,7 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) + grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*e) / ((self.v*(self.sigma**2) + e**2)**2) ) return np.squeeze(grad_sigma) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 79284b59..ff852766 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -66,6 +66,10 @@ class GP(model): # self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas + if isinstance(self.likelihood, Laplace): + print "Updating approx: ", p + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) self.K = self.kern.K(self.X) self.K += self.likelihood.covariance_matrix @@ -87,14 +91,12 @@ class GP(model): return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() def _update_params_callback(self, p): - #FIXME:Check the transforming - #Set the new parameters of the kernel and likelihood within the optimization - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #parameters will be in transformed space self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()]) + #set_params_transformed for likelihood doesn't exist? self.likelihood._set_params(p[self.kern.Nparam_transformed():]) #update the likelihood approximation within the optimisation with the current parameters self.update_likelihood_approximation() - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def update_likelihood_approximation(self): """ @@ -123,7 +125,9 @@ class GP(model): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + print "Log likelihood: ", l + return l def _log_likelihood_gradients(self): """ @@ -135,7 +139,7 @@ class GP(model): if isinstance(self.likelihood, Laplace): dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - fake_dL_dKs = np.eye(self.dL_dK.shape[0]) + fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now @@ -145,13 +149,11 @@ class GP(model): #print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + #print "dL_dthetaL: ", dL_dthetaL print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT else: - #print "dL_dthetaK: ", dL_dthetaK dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - #print "dL_dthetaL: ", dL_dthetaL return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))