diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 27f063dc..203d308d 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -37,9 +37,9 @@ def timing(): def debug_student_t_noise_approx(): plot = False - real_var = 0.1 + real_var = 0.4 #Start a function, any function - X = np.linspace(0.0, 10.0, 2)[:, None] + X = np.linspace(0.0, 10.0, 100)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 500)[:, None] @@ -89,12 +89,12 @@ def debug_student_t_noise_approx(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) #m.constrain_positive('rbf') - m.constrain_fixed('rbf_v', 1.0898) - m.constrain_fixed('rbf_l', 1.8651) + #m.constrain_fixed('rbf_v', 1.0898) + #m.constrain_fixed('rbf_l', 1.8651) m.constrain_positive('t_noi') #m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() - m.optimize('scg', messages=True) + m.optimize(messages=True) print(m) return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index f8ba25f1..85af82f9 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,41 +79,54 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): + #FIXME: Careful of side effects! And make sure W and K are up to date! Ki, _, _, _ = pdinv(self.K) - Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) - dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat) - KW = np.dot(self.K, self.W) - I_KW_i = inv(np.eye(KW.shape[0]) + KW) - return dL_dfhat, Ki, I_KW_i + #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat + dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None] + Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R + I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) + return dL_dfhat, Ki, I_KW_i, Wi_K_i def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): """ Gradients with respect to prior kernel parameters """ - dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() - K_Wi_i = inv(self.K + inv(self.W)) - dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) + dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() + dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None] dL_dthetaK = np.zeros(dK_dthetaK.shape) for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): #Explicit - dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i)) + dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) #Implicit - df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp) + df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) - return dL_dthetaK + return np.squeeze(dL_dthetaK) def _gradients(self, partial): """ Gradients with respect to likelihood parameters """ - dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() + dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat) - dL_dthetaL = np.zeros(dlik_dthetaL.shape) - return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) + num_params = len(dlik_dthetaL) + #Ki_W_i = np.diag(inv(Ki + self.W))[:, None] + dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter + for thetaL_i in range(num_params): + #Explicit + #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) + #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None]) + # might be + + dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + #Implicit + df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) + + return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ @@ -232,8 +245,8 @@ class Laplace(likelihood): self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) - self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) + self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) + self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None] solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index d75e7218..c6186137 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -302,12 +302,13 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) / + dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) / ((e**2 + (self.sigma**2)*self.v)**2) ) - return np.squeeze(dlik_hess_dsigma) + return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): + #must be listed in same order as 'get_param_names' derivs = ([self.link_dstd(y, f, extra_data=extra_data)], [self.dlik_df_dstd(y, f, extra_data=extra_data)], [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 0b5a8db6..9ce83a5a 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -69,7 +69,6 @@ class GP(model): self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas if isinstance(self.likelihood, Laplace): - print "Updating approx: ", p self.likelihood.fit_full(self.kern.K(self.X)) self.likelihood._set_params(self.likelihood._get_params()) @@ -134,7 +133,6 @@ class GP(model): matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z - print "Log likelihood: ", l return l def _log_likelihood_gradients(self): @@ -145,17 +143,16 @@ class GP(model): """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): - dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))